HDF5 compressed chunk direct read
This notebook illustrates how to read compressed chunks directly and decompress them from Python for the Blosc2 and Bitshuffle filters. It compares this approach with reading the compressed datasets through h5py and hdf5plugin.
hdf5plugin config
Performance depends on the hdf5plugin build config, on environment variables (OMP_NUM_THREADS and BLOSC_NTHREADS) and on the number of available CPU cores.
[1]:
# Set CPU affinity and multi-threading env. vars before any import
import os

os.sched_setaffinity(0, [0])  # Restrict this process to a single CPU core
AFFINITY = os.sched_getaffinity(0)
NCPU = len(AFFINITY)
print(f"Number of CPU: {NCPU}; Affinity: {AFFINITY}")

os.environ["OMP_NUM_THREADS"] = str(NCPU)
os.environ["BLOSC_NTHREADS"] = str(NCPU)
print(f"""env:
OMP_NUM_THREADS: {os.environ.get("OMP_NUM_THREADS", "unset")}
BLOSC_NTHREADS: {os.environ.get("BLOSC_NTHREADS", "unset")}
""")
import h5py
import hdf5plugin

config = hdf5plugin.get_config()
build_config = "\n".join(
    f"  {k}: {v}" for k, v in config.build_config._asdict().items()
)
print(f"""hdf5plugin:
Version: {hdf5plugin.version}
Build config:
{build_config}
""")
Number of CPU: 1; Affinity: {0}
env:
OMP_NUM_THREADS: 1
BLOSC_NTHREADS: 1
hdf5plugin:
Version: 4.2.0
Build config:
openmp: True
native: True
bmi2: True
sse2: True
avx2: True
avx512: False
cpp11: True
cpp14: True
ipp: False
filter_file_extension: .so
embedded_filters: ('blosc', 'blosc2', 'bshuf', 'bzip2', 'fcidecomp', 'lz4', 'sz', 'sz3', 'zfp', 'zstd')
Demo data
The data file is available at http://www.silx.org/pub/pyFAI/pyFAI_UM_2020/data_ID13/kevlar.h5.
Prepare two files, each containing a single dataset compressed with either the Blosc2 or the Bitshuffle filter.
[2]:
# Download dataset
!wget -O /dev/shm/kevlar.h5 http://www.silx.org/pub/pyFAI/pyFAI_UM_2020/data_ID13/kevlar.h5
[3]:
import h5py
import hdf5plugin

# Load one reference frame from the downloaded file
with h5py.File("/dev/shm/kevlar.h5", "r") as h:
    data_ref = h["/entry/data/data"][500]

# Store the frame as a single chunk compressed with the Blosc2 filter (LZ4 + bitshuffle)
with h5py.File("/dev/shm/kevlar_blosc2.h5", "w") as h:
    h.create_dataset(
        "data",
        data=data_ref,
        chunks=data_ref.shape,
        compression=hdf5plugin.Blosc2(
            cname='lz4',
            clevel=5,
            filters=hdf5plugin.Blosc2.BITSHUFFLE,
        ),
    )

# Store the same frame as a single chunk compressed with the Bitshuffle filter (LZ4 by default)
with h5py.File("/dev/shm/kevlar_bitshuffle.h5", "w") as h:
    h.create_dataset(
        "data",
        data=data_ref,
        chunks=data_ref.shape,
        compression=hdf5plugin.Bitshuffle(),
    )
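Note that chunks=data_ref.shape stores each dataset as a single chunk covering the whole frame, so the complete image can later be retrieved with a single read_direct_chunk call.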
With Blosc2
Read the compressed chunk with read_direct_chunk and decompress it with blosc2.
[4]:
import blosc2
import numpy

def decompress_blosc2_chunk(chunk: bytes, array: numpy.ndarray):
    """Decompress Blosc2-compressed chunk data into the provided array"""
    blosc2.schunk_from_cframe(chunk).get_slice(out=array)
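The Blosc2 HDF5 filter stores each chunk as a self-describing Blosc2 contiguous frame (cframe), which is why blosc2.schunk_from_cframe can parse it directly; get_slice then decompresses the whole frame into the pre-allocated out array.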
[5]:
# Allocate the output array once, with the dataset's shape and dtype
with h5py.File("/dev/shm/kevlar_blosc2.h5", "r") as h:
    ds = h["data"]
    array = numpy.empty(ds.shape, dtype=ds.dtype)
[6]:
%%timeit -r10 -n10 -o -q
# Read the compressed chunk and decompress it into the pre-allocated array
with h5py.File("/dev/shm/kevlar_blosc2.h5", "r") as h5file:
    ds = h5file["data"]
    filter_mask, chunk = ds.id.read_direct_chunk(ds.id.get_chunk_info(0).chunk_offset)
decompress_blosc2_chunk(chunk, array)
[6]:
<TimeitResult : 3.7 ms ± 209 µs per loop (mean ± std. dev. of 10 runs, 10 loops each)>
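As a quick correctness check, not part of the original benchmark, the decompressed array can be compared against the reference frame; this sketch assumes data_ref from the preparation cell above is still defined.
[ ]:
# Sanity check (sketch): direct chunk read + blosc2 decompression should reproduce the frame
with h5py.File("/dev/shm/kevlar_blosc2.h5", "r") as h5file:
    ds = h5file["data"]
    filter_mask, chunk = ds.id.read_direct_chunk(ds.id.get_chunk_info(0).chunk_offset)
decompress_blosc2_chunk(chunk, array)
assert numpy.array_equal(array, data_ref)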
[7]:
%%timeit -r10 -n10 -o -q
# Read data through h5py and libhdf5 (decompression happens in the HDF5 filter pipeline)
with h5py.File("/dev/shm/kevlar_blosc2.h5", "r") as h5file:
    data = h5file["data"][()]
[7]:
<TimeitResult : 8.92 ms ± 116 µs per loop (mean ± std. dev. of 10 runs, 10 loops each)>
[8]:
%%timeit -r10 -n10 -o -q
# Read through libhdf5 into the pre-allocated array with read_direct
with h5py.File("/dev/shm/kevlar_blosc2.h5", "r") as h:
    h["data"].read_direct(array)
[8]:
<TimeitResult : 5.18 ms ± 110 µs per loop (mean ± std. dev. of 10 runs, 10 loops each)>
With Bitshuffle
Read the compressed chunk with read_direct_chunk and decompress it with bitshuffle.
[9]:
import struct
import bitshuffle
import numpy

def decompress_bslz4_chunk(payload, dtype, chunk_shape):
    """Decompress ONE chunk compressed with bitshuffle-LZ4.

    Warning: bitshuffle needs to be compiled without OpenMP when using threads!

    :param payload: bytes with the compressed data as read by h5py
    :param dtype: data type of the stored content
    :param chunk_shape: shape of one chunk
    :return: decompressed chunk data as a numpy array
    """
    # 12-byte header: total uncompressed bytes (uint64) + LZ4 block size in bytes (uint32), big-endian
    total_nbytes, block_nbytes = struct.unpack(">QI", payload[:12])
    block_size = block_nbytes // dtype.itemsize  # Block size in number of elements
    arr = numpy.frombuffer(payload, dtype=numpy.uint8, offset=12)  # No copy here
    return bitshuffle.decompress_lz4(arr, chunk_shape, dtype, block_size)
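Unlike a Blosc2 cframe, the chunk written by the Bitshuffle filter is not self-describing: its 12-byte header stores only the total uncompressed size and the LZ4 block size, so the dtype and chunk shape have to be supplied from the dataset's metadata.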
[10]:
%%timeit -r10 -n10 -o -q
# Read the compressed chunk and decompress it
with h5py.File("/dev/shm/kevlar_bitshuffle.h5", "r") as h:
    ds = h["data"]
    filter_mask, chunk = ds.id.read_direct_chunk(ds.id.get_chunk_info(0).chunk_offset)
    array = decompress_bslz4_chunk(chunk, ds.dtype, ds.chunks)
[10]:
<TimeitResult : 4.35 ms ± 104 µs per loop (mean ± std. dev. of 10 runs, 10 loops each)>
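The same kind of correctness check, again assuming data_ref is still defined, can be applied to the Bitshuffle path:
[ ]:
# Sanity check (sketch): direct chunk read + bitshuffle decompression should reproduce the frame
with h5py.File("/dev/shm/kevlar_bitshuffle.h5", "r") as h:
    ds = h["data"]
    filter_mask, chunk = ds.id.read_direct_chunk(ds.id.get_chunk_info(0).chunk_offset)
    decompressed = decompress_bslz4_chunk(chunk, ds.dtype, ds.chunks)
assert numpy.array_equal(decompressed, data_ref)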
[11]:
%%timeit -r10 -n10 -o -q
# Read data through h5py and libhdf5 (decompression happens in the HDF5 filter pipeline)
with h5py.File("/dev/shm/kevlar_bitshuffle.h5", "r") as h:
    data = h["data"][()]
[11]:
<TimeitResult : 9.24 ms ± 154 µs per loop (mean ± std. dev. of 10 runs, 10 loops each)>
[12]:
%%timeit -r10 -n10 -o -q
# Read through libhdf5 into the pre-allocated array with read_direct
with h5py.File("/dev/shm/kevlar_bitshuffle.h5", "r") as h:
    h["data"].read_direct(array)
[12]:
<TimeitResult : 5.45 ms ± 175 µs per loop (mean ± std. dev. of 10 runs, 10 loops each)>
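Summary: on this single-core setup, reading the raw chunk and decompressing it directly in Python takes roughly half the time of a plain h5py read for both filters (3.7 ms vs 8.92 ms with Blosc2, 4.35 ms vs 9.24 ms with Bitshuffle), while read_direct into a pre-allocated array falls in between. With more CPU cores and multi-threaded decompression enabled, the relative timings may differ.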
[ ]: