Commit cdb7f73

explicit h5py dependency

1 parent b12fa5e commit cdb7f73

File tree

2 files changed: +19 -8 lines changed

setup.py

Lines changed: 1 addition & 0 deletions

@@ -166,6 +166,7 @@
     "aiohttp",
     "elasticsearch>=7.17.12,<8.0.0",  # 8.0 asks users to provide hosts or cloud_id when instantiating ElasticSearch(); 7.9.1 has legacy numpy.float_ which was fixed in https://github.com/elastic/elasticsearch-py/pull/2551.
     "faiss-cpu>=1.8.0.post1",  # Pins numpy < 2
+    "h5py",  # FIXME: probably needs a lower bound
     "jax>=0.3.14; sys_platform != 'win32'",
     "jaxlib>=0.3.14; sys_platform != 'win32'",
     "lz4",

src/datasets/packaged_modules/hdf5/hdf5.py

Lines changed: 18 additions & 8 deletions
@@ -1,12 +1,11 @@
 import itertools
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional

 import numpy as np
 import pyarrow as pa

 import datasets
-import h5py
 from datasets.features.features import (
     Array2D,
     Array3D,
@@ -21,6 +20,9 @@
 from datasets.table import table_cast


+if TYPE_CHECKING:
+    import h5py
+
 logger = datasets.utils.logging.get_logger(__name__)

 EXTENSIONS = [".h5", ".hdf5"]
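The hunk above is the standard deferred-import pattern: h5py is imported only for static type checkers, and annotations that mention it are quoted strings, so importing this module no longer requires h5py at load time. A minimal self-contained sketch of the pattern (the function and names here are illustrative, not from this repo):

from typing import TYPE_CHECKING, Dict

if TYPE_CHECKING:
    import h5py  # evaluated by type checkers only, never at runtime


def describe(dset: "h5py.Dataset") -> Dict[str, str]:
    # The quoted annotation keeps the signature checkable without the module import.
    import h5py  # deferred: only runs when the function is actually called

    assert isinstance(dset, h5py.Dataset)
    return {"name": dset.name, "dtype": str(dset.dtype)}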
@@ -56,6 +58,8 @@ def _info(self):
         return datasets.DatasetInfo(features=self.config.features)

     def _split_generators(self, dl_manager):
+        import h5py
+
         if not self.config.data_files:
             raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}")
         dl_manager.download_config.extract_on_the_fly = True
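For context, a hedged usage sketch of what this builder enables, assuming the packaged module is registered under the name "hdf5" the way csv and json are; the file path is made up:

from datasets import load_dataset

# Loads every dataset inside the HDF5 file as columns of a Dataset.
ds = load_dataset("hdf5", data_files="data.h5", split="train")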
@@ -119,6 +123,8 @@ def _cast_table(self, pa_table: pa.Table) -> pa.Table:
         return pa_table

     def _generate_tables(self, files):
+        import h5py
+
         batch_size_cfg = self.config.batch_size
         for file_idx, file in enumerate(itertools.chain.from_iterable(files)):
             try:
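_generate_tables yields tables batch by batch. The sketch below shows the underlying h5py idiom of slicing a dataset in chunks so only one batch is read from disk at a time; the file name, dataset key, and batch size are assumptions:

import h5py

with h5py.File("data.h5", "r") as f:
    dset = f["x"]  # hypothetical dataset key
    batch_size = 1000
    for start in range(0, dset.shape[0], batch_size):
        batch = dset[start : start + batch_size]  # reads only this slice from disk
        print(start, batch.shape)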
@@ -179,7 +185,9 @@ def _generate_tables(self, files):
             raise


-def _traverse_datasets(h5_obj, prefix: str = "") -> Dict[str, h5py.Dataset]:
+def _traverse_datasets(h5_obj, prefix: str = "") -> Dict[str, "h5py.Dataset"]:
+    import h5py
+
     mapping: Dict[str, h5py.Dataset] = {}

     def collect_datasets(name, obj):
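The collect_datasets callback suggests h5py's visititems traversal; a minimal sketch of that pattern (function names here are illustrative, and the prefix handling of the real _traverse_datasets is omitted):

from typing import Dict

import h5py


def collect(h5_obj) -> Dict[str, "h5py.Dataset"]:
    mapping: Dict[str, h5py.Dataset] = {}

    def visitor(name, obj):
        # visititems invokes this for every group and dataset below h5_obj
        if isinstance(obj, h5py.Dataset):
            mapping[name] = obj

    h5_obj.visititems(visitor)
    return mapping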
@@ -201,7 +209,7 @@ def _is_complex_dtype(dtype: np.dtype) -> bool:
    return dtype.kind == "c"


-def _create_complex_features(base_path: str, dset: h5py.Dataset) -> Dict[str, Value]:
+def _create_complex_features(base_path: str, dset: "h5py.Dataset") -> Dict[str, Value]:
    """Create separate features for real and imaginary parts of complex data.

    NOTE: Always uses float64 for the real and imaginary parts.
@@ -212,7 +220,7 @@ def _create_complex_features(base_path: str, dset: h5py.Dataset) -> Dict[str, Value]:
    return {f"{base_path}_real": Value("float64"), f"{base_path}_imag": Value("float64")}


-def _convert_complex_to_separate_columns(base_path: str, arr: np.ndarray, dset: h5py.Dataset) -> Dict[str, pa.Array]:
+def _convert_complex_to_separate_columns(base_path: str, arr: np.ndarray, dset: "h5py.Dataset") -> Dict[str, pa.Array]:
    """Convert complex array to separate real and imaginary columns."""
    result = {}
    result[f"{base_path}_real"] = datasets.features.features.numpy_to_pyarrow_listarray(arr.real)
@@ -236,7 +244,7 @@ def __init__(self, dtype):
        self.names = dtype.names


-def _create_compound_features(base_path: str, dset: h5py.Dataset) -> Dict[str, Any]:
+def _create_compound_features(base_path: str, dset: "h5py.Dataset") -> Dict[str, Any]:
    """Create separate features for each field in compound data."""
    field_names = list(dset.dtype.names)
    logger.info(
@@ -262,7 +270,9 @@ def _create_compound_features(base_path: str, dset: h5py.Dataset) -> Dict[str, Any]:
    return features


-def _convert_compound_to_separate_columns(base_path: str, arr: np.ndarray, dset: h5py.Dataset) -> Dict[str, pa.Array]:
+def _convert_compound_to_separate_columns(
+    base_path: str, arr: np.ndarray, dset: "h5py.Dataset"
+) -> Dict[str, pa.Array]:
    """Convert compound array to separate columns for each field."""
    result = {}
    for field_name in list(dset.dtype.names):
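Compound (structured) dtypes get flattened into one Arrow column per field. A sketch of the idea with an invented dtype and base path:

import numpy as np
import pyarrow as pa

arr = np.array([(1.0, 2), (3.0, 4)], dtype=[("x", np.float64), ("y", np.int32)])

columns = {}
for field_name in arr.dtype.names:
    # Field views into a structured array are strided; copy before handing to Arrow.
    columns[f"point_{field_name}"] = pa.array(np.ascontiguousarray(arr[field_name]))

table = pa.table(columns)  # columns: point_x (double), point_y (int32)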
@@ -314,7 +324,7 @@ def _convert_vlen_string_to_array(arr: np.ndarray) -> pa.Array:
    # └───────────┘


-def _infer_feature_from_dataset(dset: h5py.Dataset):
+def _infer_feature_from_dataset(dset: "h5py.Dataset"):
    # non-string varlen
    if hasattr(dset.dtype, "metadata") and dset.dtype.metadata and "vlen" in dset.dtype.metadata:
        vlen_dtype = dset.dtype.metadata["vlen"]
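The vlen check works because h5py records the element type of a variable-length dtype under the "vlen" key of dtype.metadata; a quick demonstration:

import h5py
import numpy as np

vlen_int = h5py.vlen_dtype(np.int32)  # dtype for ragged int32 rows
if vlen_int.metadata and "vlen" in vlen_int.metadata:
    print(vlen_int.metadata["vlen"])  # the per-element type, int32 here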
