1
1
import itertools
2
2
from dataclasses import dataclass
3
- from typing import TYPE_CHECKING , Any , Dict , List as ListT , Optional
3
+ from typing import TYPE_CHECKING , Any , Dict , Optional
4
+ from typing import List as ListT
4
5
5
6
import numpy as np
6
7
import pyarrow as pa
11
12
Array3D ,
12
13
Array4D ,
13
14
Array5D ,
15
+ Features ,
14
16
LargeList ,
15
17
List ,
16
18
Value ,
@@ -77,29 +79,16 @@ def _split_generators(self, dl_manager):
77
79
dataset_map = _traverse_datasets (h5 )
78
80
features_dict = {}
79
81
80
- def _check_column_collisions (new_columns , source_dataset_path ):
81
- """Check for column name collisions and raise informative errors."""
82
- for new_col in new_columns :
83
- if new_col in features_dict :
84
- raise ValueError (
85
- f"Column name collision detected: '{ new_col } ' from dataset '{ source_dataset_path } ' "
86
- f"conflicts with existing column. Consider renaming datasets in the HDF5 file."
87
- )
88
-
89
82
for path , dset in dataset_map .items ():
90
83
if _is_complex_dtype (dset .dtype ):
91
84
complex_features = _create_complex_features (path , dset )
92
- _check_column_collisions (complex_features .keys (), path )
93
85
features_dict .update (complex_features )
94
86
elif _is_compound_dtype (dset .dtype ):
95
87
compound_features = _create_compound_features (path , dset )
96
- _check_column_collisions (compound_features .keys (), path )
97
88
features_dict .update (compound_features )
98
89
elif _is_vlen_string_dtype (dset .dtype ):
99
- _check_column_collisions ([path ], path )
100
90
features_dict [path ] = Value ("string" )
101
91
else :
102
- _check_column_collisions ([path ], path )
103
92
feat = _infer_feature_from_dataset (dset )
104
93
features_dict [path ] = feat
105
94
self .info .features = datasets .Features (features_dict )
@@ -175,9 +164,9 @@ def _generate_tables(self, files):
175
164
pa_arr = datasets .features .features .numpy_to_pyarrow_listarray (arr )
176
165
batch_dict [path ] = pa_arr
177
166
elif _is_complex_dtype (dset .dtype ):
178
- batch_dict .update (_convert_complex_to_separate_columns (path , arr , dset ))
167
+ batch_dict .update (_convert_complex_to_nested (path , arr , dset ))
179
168
elif _is_compound_dtype (dset .dtype ):
180
- batch_dict .update (_convert_compound_to_separate_columns (path , arr , dset ))
169
+ batch_dict .update (_convert_compound_to_nested (path , arr , dset ))
181
170
elif dset .dtype .kind == "O" :
182
171
raise ValueError (
183
172
f"Object dtype dataset '{ path } ' is not supported. "
@@ -219,22 +208,36 @@ def _is_complex_dtype(dtype: np.dtype) -> bool:
219
208
return dtype .kind == "c"
220
209
221
210
222
def _create_complex_features(base_path: str, dset: "h5py.Dataset") -> Dict[str, Any]:
    """Build the feature mapping for a complex-valued HDF5 dataset.

    The dataset is exposed as a single nested column named ``base_path`` whose
    value is a struct with two float fields, ``real`` and ``imag``.

    NOTE: Always uses float64 for the real and imaginary parts.
    """
    logger.info(
        f"Complex dataset '{base_path}' (dtype: {dset.dtype}) represented as nested structure with 'real' and 'imag' fields"
    )
    # Both parts share the same storage type, so build them from one template.
    part_features = {part: Value("float64") for part in ("real", "imag")}
    return {base_path: Features(part_features)}
231
226
232
227
233
def _convert_complex_to_nested(base_path: str, arr: np.ndarray, dset: "h5py.Dataset") -> Dict[str, pa.Array]:
    """Convert complex to Features with real and imaginary parts `real` and `imag`."""

    def _to_struct_dict(element):
        """Map one array element to a {'real': ..., 'imag': ...} dict."""
        if element.size == 1:
            # Scalar element: unwrap to a plain Python complex, emit floats.
            scalar = element.item()
            return {"real": float(scalar.real), "imag": float(scalar.imag)}
        # Multi-dimensional element: emit (possibly nested) lists of floats.
        return {"real": element.real.tolist(), "imag": element.imag.tolist()}

    struct_rows = [_to_struct_dict(element) for element in arr]
    return {base_path: pa.array(struct_rows)}
239
242
240
243
@@ -255,51 +258,56 @@ def __init__(self, dtype):
255
258
256
259
257
260
def _create_compound_features(base_path: str, dset: "h5py.Dataset") -> Dict[str, Any]:
    """Create nested features for compound data with field names as keys."""
    field_names = list(dset.dtype.names)
    logger.info(
        f"Compound dataset '{base_path}' (dtype: {dset.dtype}) represented as nested Features with fields: {field_names}"
    )

    fields: Dict[str, Any] = {}
    for name in field_names:
        sub_dtype = dset.dtype[name]
        if _is_complex_dtype(sub_dtype):
            # Complex fields become a {real, imag} struct of float64.
            fields[name] = Features({"real": Value("float64"), "imag": Value("float64")})
        elif _is_compound_dtype(sub_dtype):
            # Recurse into nested compound types via a lightweight stand-in dataset.
            fields[name] = _create_compound_features(name, _MockDataset(sub_dtype))[name]
        else:
            # Plain scalar field: map numpy dtype -> pyarrow -> HF Value feature.
            fields[name] = _np_to_pa_to_hf_value(sub_dtype)

    return {base_path: Features(fields)}
281
286
282
287
283
def _convert_compound_to_nested(base_path: str, arr: np.ndarray, dset: "h5py.Dataset") -> Dict[str, pa.Array]:
    """Convert compound array to nested structure with field names as keys."""

    def _record_to_dict(record, dtype):
        """Turn one compound record into a plain dict, recursing into sub-records."""
        converted = {}
        for name in dtype.names:
            sub_dtype = dtype[name]
            value = record[name]
            if _is_complex_dtype(sub_dtype):
                converted[name] = {"real": float(value.real), "imag": float(value.imag)}
            elif _is_compound_dtype(sub_dtype):
                # Nested compound field: convert the sub-record in place.
                converted[name] = _record_to_dict(value, sub_dtype)
            else:
                # Scalars unwrap to Python values; shaped fields become lists.
                converted[name] = value.item() if value.size == 1 else value.tolist()
        return converted

    rows = [_record_to_dict(record, dset.dtype) for record in arr]
    return {base_path: pa.array(rows)}
304
312
305
313
0 commit comments