Skip to content

Commit 38278c8

Browse files
authored
Custom filter support (#159)
* add support for bitshuffle * bump version
1 parent f1cf192 commit 38278c8

File tree

4 files changed

+64
-12
lines changed

4 files changed

+64
-12
lines changed

h5pyd/_apps/utillib.py

Lines changed: 42 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,13 @@
2828
MIN_CHUNK_SIZE = 1 * 1024 * 1024
2929
MAC_CHUNK_SIZE = 8 * 1024 * 1024
3030

31+
H5Z_FILTER_MAP = { 32001: "blosclz",
32+
32004: "lz4",
33+
32008: "bitshuffle",
34+
32015: "zstd",
35+
}
36+
37+
3138
# check if hdf5 library version supports chunk iteration
3239
hdf_library_version = h5py.version.hdf5_version_tuple
3340
library_has_chunk_iter = (hdf_library_version >= (1, 14, 0) or (hdf_library_version < (1, 12, 0) and (hdf_library_version >= (1, 10, 10))))
@@ -761,8 +768,8 @@ def create_chunktable(dset, dset_dims, ctx):
761768
chunk_key += str(index[dim] // chunk_dims[dim])
762769
if dim < rank - 1:
763770
chunk_key += "_"
764-
logging.debug(f"adding chunk_key: {chunk_key}")
765-
chunk_map[chunk_key] = (chunk_info.byte_offset, chunk_info.size)
771+
logging.debug(f"adding chunk_key: {chunk_key}")
772+
chunk_map[chunk_key] = (chunk_info.byte_offset, chunk_info.size)
766773

767774
chunks["class"] = "H5D_CHUNKED_REF"
768775
if not extend:
@@ -1121,6 +1128,7 @@ def create_dataset(dobj, ctx):
11211128
# or vlen
11221129
pass
11231130
else:
1131+
logging.debug(f"filter setup for {dobj.name}")
11241132
if not ctx["ignorefilters"]:
11251133
kwargs["compression"] = dobj.compression
11261134
kwargs["compression_opts"] = dobj.compression_opts
@@ -1134,7 +1142,7 @@ def create_dataset(dobj, ctx):
11341142

11351143
# TBD: it would be better if HSDS could let us know what filters
11361144
# are supported (like it does with compressors)
1137-
# For now, just hard-code fletcher32 and scaleoffset to be ignored
1145+
# For now, just hard-code fletcher32 and scaleoffset to be ignored
11381146
if dobj.fletcher32:
11391147
msg = f"fletcher32 filter used by dataset: {dobj.name} is not "
11401148
msg += "supported by HSDS, this filter will not be used"
@@ -1144,7 +1152,35 @@ def create_dataset(dobj, ctx):
11441152
msg = f"scaleoffset filter used by dataset: {dobj.name} is not "
11451153
msg += "supported by HSDS, this filter will not be used"
11461154
logging.warning(msg)
1147-
# kwargs["scaleoffset"] = dobj.scaleoffset
1155+
1156+
if is_h5py(dobj) and not kwargs.get("compression"):
1157+
# apply any custom filters as long as they are supported in HSDS
1158+
for filter_id in dobj._filters:
1159+
filter_opts = dobj._filters[filter_id]
1160+
try:
1161+
filter_id = int(filter_id)
1162+
except ValueError:
1163+
msg = f"unrecognized filter id: {filter_id} for {dobj.name}, ignoring"
1164+
logging.warning(msg)
1165+
1166+
if not isinstance(filter_id, int):
1167+
continue
1168+
1169+
if filter_id in H5Z_FILTER_MAP:
1170+
filter_name = H5Z_FILTER_MAP[filter_id]
1171+
if filter_name == "bitshuffle":
1172+
kwargs["shuffle"] = filter_name
1173+
logging.info(f"using bitshuffle on {dobj.name}")
1174+
else:
1175+
# supported non-standard compressor
1176+
kwargs["compression"] = filter_name
1177+
logging.info(f"using compressor: {filter_name} for {dobj.name}")
1178+
kwargs["compression_opts"] = filter_opts
1179+
logging.info(f"compression_opts: {filter_opts}")
1180+
else:
1181+
logging.warning(f"filter id {filter_id} for {dobj.name} not supported")
1182+
1183+
# kwargs["scaleoffset"] = dobj.scaleoffset
11481184
# setting the fillvalue is failing in some cases
11491185
# see: https://github.com/HDFGroup/h5pyd/issues/119
11501186
# don't set fill value for reference types
@@ -1501,14 +1537,15 @@ def load_file(
15011537

15021538
logging.info(f"input file: {fin.filename}")
15031539
logging.info(f"output file: {fout.filename}")
1540+
logging.info(f"dataload: {dataload}")
15041541
if dataload != "ingest":
15051542
if not dataload:
15061543
logging.info("no data load")
15071544
elif dataload in ("link", "fastlink"):
15081545
if not s3path:
15091546
logging.error("s3path expected to be set")
15101547
sys.exit(1)
1511-
logging.info("using s3path")
1548+
logging.info(f"using s3path: {s3path}")
15121549
else:
15131550
logging.error(f"unexpected dataload value: {dataload}")
15141551
sys.exit(1)

h5pyd/_hl/filters.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -177,8 +177,23 @@ def rq_tuple(tpl, name):
177177
filters.append(filter_scaleoffset)
178178

179179
if shuffle:
180-
filter_shuffle = {"class": "H5Z_FILTER_SHUFFLE"}
181-
filter_shuffle["id"] = 2
180+
if isinstance(shuffle, int) and shuffle == 32008:
181+
bitshuffle = True
182+
elif isinstance(shuffle, str) and shuffle == "bitshuffle":
183+
bitshuffle = True
184+
else:
185+
bitshuffle = False
186+
187+
if bitshuffle:
188+
filter_shuffle = {"class": "H5Z_FILTER_BITSHUFFLE"}
189+
filter_shuffle["id"] = 32008
190+
filter_shuffle["name"] = "bitshuffle"
191+
192+
else:
193+
# regular shuffle filter
194+
filter_shuffle = {"class": "H5Z_FILTER_SHUFFLE"}
195+
filter_shuffle["id"] = 2
196+
filter_shuffle["name"] = "shuffle"
182197
filters.append(filter_shuffle)
183198

184199
if compression == "gzip":

h5pyd/version.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
import sys
1717
import numpy
1818

19-
version = "0.16.0"
19+
version = "0.17.0"
2020

2121
hdf5_version = "REST"
2222

@@ -28,8 +28,8 @@
2828
else ("",)
2929
)
3030

31-
api_version_tuple = (0, 16, 0)
32-
api_version = "0.16.0"
31+
api_version_tuple = (0, 17, 0)
32+
api_version = "0.17.0"
3333

3434
__doc__ = """\
3535
This is h5pyd **%s**

setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323

2424
setup(
2525
name="h5pyd",
26-
version="0.16.0",
26+
version="0.17.0",
2727
description="h5py compatible client lib for HDF REST API",
2828
long_description=long_description,
2929
url="http://github.com/HDFGroup/h5pyd",
@@ -70,4 +70,4 @@
7070
"hsstat = h5pyd._apps.hsstat:main",
7171
]
7272
},
73-
)
73+
)

0 commit comments

Comments
 (0)