@@ -61,8 +61,9 @@ def _split_generators(self, dl_manager):
6161 # Infer features from first file
6262 if self .info .features is None :
6363 for first_file in itertools .chain .from_iterable (files ):
64- with h5py .File (first_file , "r" ) as h5 :
65- self .info .features = _recursive_infer_features (h5 )
64+ with open (first_file , "rb" ) as f :
65+ with h5py .File (f , "r" ) as h5 :
66+ self .info .features = _recursive_infer_features (h5 )
6667 break
6768 splits .append (datasets .SplitGenerator (name = split_name , gen_kwargs = {"files" : files }))
6869 return splits
@@ -73,22 +74,23 @@ def _generate_tables(self, files):
7374 batch_size_cfg = self .config .batch_size
7475 for file_idx , file in enumerate (itertools .chain .from_iterable (files )):
7576 try :
76- with h5py .File (file , "r" ) as h5 :
77- # Infer features and lengths from first file
78- if self .info .features is None :
79- self .info .features = _recursive_infer_features (h5 )
80- num_rows = _check_dataset_lengths (h5 , self .info .features )
81- if num_rows is None :
82- logger .warning (f"File { file } contains no data, skipping..." )
83- continue
84- effective_batch = batch_size_cfg or self ._writer_batch_size or num_rows
85- for start in range (0 , num_rows , effective_batch ):
86- end = min (start + effective_batch , num_rows )
87- pa_table = _recursive_load_arrays (h5 , self .info .features , start , end )
88- if pa_table is None :
77+ with open (file , "rb" ) as f :
78+ with h5py .File (f , "r" ) as h5 :
79+ # Infer features and lengths from first file
80+ if self .info .features is None :
81+ self .info .features = _recursive_infer_features (h5 )
82+ num_rows = _check_dataset_lengths (h5 , self .info .features )
83+ if num_rows is None :
8984 logger .warning (f"File { file } contains no data, skipping..." )
9085 continue
91- yield f"{ file_idx } _{ start } " , cast_table_to_features (pa_table , self .info .features )
86+ effective_batch = batch_size_cfg or self ._writer_batch_size or num_rows
87+ for start in range (0 , num_rows , effective_batch ):
88+ end = min (start + effective_batch , num_rows )
89+ pa_table = _recursive_load_arrays (h5 , self .info .features , start , end )
90+ if pa_table is None :
91+ logger .warning (f"File { file } contains no data, skipping..." )
92+ continue
93+ yield f"{ file_idx } _{ start } " , cast_table_to_features (pa_table , self .info .features )
9294 except ValueError as e :
9395 logger .error (f"Failed to read file '{ file } ' with error { type (e )} : { e } " )
9496 raise
0 commit comments