Skip to content

Commit 3e13d30

Browse files
authored
Allow streaming hdf5 files (#7814)
allow streaming hdf5 files
1 parent aa7f2a9 commit 3e13d30

File tree

1 file changed

+18
-16
lines changed
  • src/datasets/packaged_modules/hdf5

1 file changed

+18
-16
lines changed

src/datasets/packaged_modules/hdf5/hdf5.py

Lines changed: 18 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -61,8 +61,9 @@ def _split_generators(self, dl_manager):
6161
# Infer features from first file
6262
if self.info.features is None:
6363
for first_file in itertools.chain.from_iterable(files):
64-
with h5py.File(first_file, "r") as h5:
65-
self.info.features = _recursive_infer_features(h5)
64+
with open(first_file, "rb") as f:
65+
with h5py.File(f, "r") as h5:
66+
self.info.features = _recursive_infer_features(h5)
6667
break
6768
splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={"files": files}))
6869
return splits
@@ -73,22 +74,23 @@ def _generate_tables(self, files):
7374
batch_size_cfg = self.config.batch_size
7475
for file_idx, file in enumerate(itertools.chain.from_iterable(files)):
7576
try:
76-
with h5py.File(file, "r") as h5:
77-
# Infer features and lengths from first file
78-
if self.info.features is None:
79-
self.info.features = _recursive_infer_features(h5)
80-
num_rows = _check_dataset_lengths(h5, self.info.features)
81-
if num_rows is None:
82-
logger.warning(f"File {file} contains no data, skipping...")
83-
continue
84-
effective_batch = batch_size_cfg or self._writer_batch_size or num_rows
85-
for start in range(0, num_rows, effective_batch):
86-
end = min(start + effective_batch, num_rows)
87-
pa_table = _recursive_load_arrays(h5, self.info.features, start, end)
88-
if pa_table is None:
77+
with open(file, "rb") as f:
78+
with h5py.File(f, "r") as h5:
79+
# Infer features and lengths from first file
80+
if self.info.features is None:
81+
self.info.features = _recursive_infer_features(h5)
82+
num_rows = _check_dataset_lengths(h5, self.info.features)
83+
if num_rows is None:
8984
logger.warning(f"File {file} contains no data, skipping...")
9085
continue
91-
yield f"{file_idx}_{start}", cast_table_to_features(pa_table, self.info.features)
86+
effective_batch = batch_size_cfg or self._writer_batch_size or num_rows
87+
for start in range(0, num_rows, effective_batch):
88+
end = min(start + effective_batch, num_rows)
89+
pa_table = _recursive_load_arrays(h5, self.info.features, start, end)
90+
if pa_table is None:
91+
logger.warning(f"File {file} contains no data, skipping...")
92+
continue
93+
yield f"{file_idx}_{start}", cast_table_to_features(pa_table, self.info.features)
9294
except ValueError as e:
9395
logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}")
9496
raise

0 commit comments

Comments (0)