
Commit db2e76a

Use Features(dict) for complex and compound
1 parent f709dae commit db2e76a
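
In short: a complex or compound HDF5 dataset now becomes a single nested column instead of several flattened "<path>_<suffix>" columns. A minimal before/after sketch of the schema change (the column name "signal" is a made-up example; Features and Value are the datasets feature types used in the diff below):

from datasets import Features, Value

# Before this commit: a complex dataset "signal" was flattened into two columns.
before = Features({
    "signal_real": Value("float64"),
    "signal_imag": Value("float64"),
})

# After this commit: one "signal" column holding a {"real", "imag"} struct.
after = Features({
    "signal": {"real": Value("float64"), "imag": Value("float64")},
})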

2 files changed: +93 -155 lines changed

src/datasets/packaged_modules/hdf5/hdf5.py

Lines changed: 61 additions & 53 deletions
@@ -1,6 +1,7 @@
 import itertools
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Dict, List as ListT, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional
+from typing import List as ListT

 import numpy as np
 import pyarrow as pa
@@ -11,6 +12,7 @@
     Array3D,
     Array4D,
     Array5D,
+    Features,
     LargeList,
     List,
     Value,
@@ -77,29 +79,16 @@ def _split_generators(self, dl_manager):
             dataset_map = _traverse_datasets(h5)
             features_dict = {}

-            def _check_column_collisions(new_columns, source_dataset_path):
-                """Check for column name collisions and raise informative errors."""
-                for new_col in new_columns:
-                    if new_col in features_dict:
-                        raise ValueError(
-                            f"Column name collision detected: '{new_col}' from dataset '{source_dataset_path}' "
-                            f"conflicts with existing column. Consider renaming datasets in the HDF5 file."
-                        )
-
             for path, dset in dataset_map.items():
                 if _is_complex_dtype(dset.dtype):
                     complex_features = _create_complex_features(path, dset)
-                    _check_column_collisions(complex_features.keys(), path)
                     features_dict.update(complex_features)
                 elif _is_compound_dtype(dset.dtype):
                     compound_features = _create_compound_features(path, dset)
-                    _check_column_collisions(compound_features.keys(), path)
                     features_dict.update(compound_features)
                 elif _is_vlen_string_dtype(dset.dtype):
-                    _check_column_collisions([path], path)
                    features_dict[path] = Value("string")
                 else:
-                    _check_column_collisions([path], path)
                     feat = _infer_feature_from_dataset(dset)
                     features_dict[path] = feat
             self.info.features = datasets.Features(features_dict)
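
Note that the _check_column_collisions helper is gone entirely: with the nested representation, a sibling HDF5 dataset named "data_real" can no longer clash with columns derived from a complex dataset "data", so the check (and its tests, removed further down) has nothing left to guard against. A small sketch of a schema the old flattening would have rejected, using the names from the removed collision fixture:

from datasets import Features, Value

# "data" (complex) and "data_real" (float32) now coexist as distinct top-level
# columns; under the old "data_real"/"data_imag" flattening these collided.
features = Features({
    "data": {"real": Value("float64"), "imag": Value("float64")},
    "data_real": Value("float32"),
})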
@@ -175,9 +164,9 @@ def _generate_tables(self, files):
                         pa_arr = datasets.features.features.numpy_to_pyarrow_listarray(arr)
                         batch_dict[path] = pa_arr
                     elif _is_complex_dtype(dset.dtype):
-                        batch_dict.update(_convert_complex_to_separate_columns(path, arr, dset))
+                        batch_dict.update(_convert_complex_to_nested(path, arr, dset))
                     elif _is_compound_dtype(dset.dtype):
-                        batch_dict.update(_convert_compound_to_separate_columns(path, arr, dset))
+                        batch_dict.update(_convert_compound_to_nested(path, arr, dset))
                     elif dset.dtype.kind == "O":
                         raise ValueError(
                             f"Object dtype dataset '{path}' is not supported. "
@@ -219,22 +208,36 @@ def _is_complex_dtype(dtype: np.dtype) -> bool:
     return dtype.kind == "c"


-def _create_complex_features(base_path: str, dset: "h5py.Dataset") -> Dict[str, Value]:
-    """Create separate features for real and imaginary parts of complex data.
+def _create_complex_features(base_path: str, dset: "h5py.Dataset") -> Dict[str, Any]:
+    """Create Features for complex data with real and imaginary parts `real` and `imag`.

     NOTE: Always uses float64 for the real and imaginary parts.
     """
     logger.info(
-        f"Complex dataset '{base_path}' (dtype: {dset.dtype}) split into '{base_path}_real' and '{base_path}_imag'"
+        f"Complex dataset '{base_path}' (dtype: {dset.dtype}) represented as nested structure with 'real' and 'imag' fields"
+    )
+    nested_features = Features(
+        {
+            "real": Value("float64"),
+            "imag": Value("float64"),
+        }
     )
-    return {f"{base_path}_real": Value("float64"), f"{base_path}_imag": Value("float64")}
+    return {base_path: nested_features}


-def _convert_complex_to_separate_columns(base_path: str, arr: np.ndarray, dset: "h5py.Dataset") -> Dict[str, pa.Array]:
-    """Convert complex array to separate real and imaginary columns."""
+def _convert_complex_to_nested(base_path: str, arr: np.ndarray, dset: "h5py.Dataset") -> Dict[str, pa.Array]:
+    """Convert complex to Features with real and imaginary parts `real` and `imag`."""
     result = {}
-    result[f"{base_path}_real"] = datasets.features.features.numpy_to_pyarrow_listarray(arr.real)
-    result[f"{base_path}_imag"] = datasets.features.features.numpy_to_pyarrow_listarray(arr.imag)
+
+    def _convert_complex_scalar(complex_val):
+        """Convert a complex scalar to a dictionary."""
+        if complex_val.size == 1:
+            return {"real": float(complex_val.item().real), "imag": float(complex_val.item().imag)}
+        else:
+            # For multi-dimensional arrays, convert to list
+            return {"real": complex_val.real.tolist(), "imag": complex_val.imag.tolist()}
+
+    result[base_path] = pa.array([_convert_complex_scalar(complex_val) for complex_val in arr])
     return result

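As a rough illustration of the conversion above, a standalone sketch that mirrors _convert_complex_scalar for scalar elements (the input array is made up):

import numpy as np
import pyarrow as pa

arr = np.array([1 + 2j, 3 + 4j], dtype=np.complex64)

# Each complex value becomes a {"real": ..., "imag": ...} dict, and the list
# of dicts becomes a single Arrow struct column.
rows = [{"real": float(v.real), "imag": float(v.imag)} for v in arr]
col = pa.array(rows)
print(col.type)         # struct<real: double, imag: double>
print(col.to_pylist())  # [{'real': 1.0, 'imag': 2.0}, {'real': 3.0, 'imag': 4.0}]
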
@@ -255,51 +258,56 @@ def __init__(self, dtype):


 def _create_compound_features(base_path: str, dset: "h5py.Dataset") -> Dict[str, Any]:
-    """Create separate features for each field in compound data."""
+    """Create nested features for compound data with field names as keys."""
     field_names = list(dset.dtype.names)
     logger.info(
-        f"Compound dataset '{base_path}' (dtype: {dset.dtype}) flattened into {len(field_names)} columns: {field_names}"
+        f"Compound dataset '{base_path}' (dtype: {dset.dtype}) represented as nested Features with fields: {field_names}"
     )

-    features = {}
+    nested_features_dict = {}
     for field_name in field_names:
         field_dtype = dset.dtype[field_name]
-        field_path = f"{base_path}_{field_name}"

         if _is_complex_dtype(field_dtype):
-            features[f"{field_path}_real"] = Value("float64")
-            features[f"{field_path}_imag"] = Value("float64")
+            nested_features_dict[field_name] = Features(
+                {
+                    "real": Value("float64"),
+                    "imag": Value("float64"),
+                }
+            )
         elif _is_compound_dtype(field_dtype):
             mock_dset = _MockDataset(field_dtype)
-            nested_features = _create_compound_features(field_path, mock_dset)
-            features.update(nested_features)
+            nested_features_dict[field_name] = _create_compound_features(field_name, mock_dset)[field_name]
         else:
-            value_feature = _np_to_pa_to_hf_value(field_dtype)
-            features[field_path] = value_feature
+            nested_features_dict[field_name] = _np_to_pa_to_hf_value(field_dtype)

-    return features
+    nested_features = Features(nested_features_dict)
+    return {base_path: nested_features}


-def _convert_compound_to_separate_columns(
-    base_path: str, arr: np.ndarray, dset: "h5py.Dataset"
-) -> Dict[str, pa.Array]:
-    """Convert compound array to separate columns for each field."""
+def _convert_compound_to_nested(base_path: str, arr: np.ndarray, dset: "h5py.Dataset") -> Dict[str, pa.Array]:
+    """Convert compound array to nested structure with field names as keys."""
     result = {}
-    for field_name in list(dset.dtype.names):
-        field_dtype = dset.dtype[field_name]
-        field_path = f"{base_path}_{field_name}"
-        field_data = arr[field_name]
-
-        if _is_complex_dtype(field_dtype):
-            result[f"{field_path}_real"] = datasets.features.features.numpy_to_pyarrow_listarray(field_data.real)
-            result[f"{field_path}_imag"] = datasets.features.features.numpy_to_pyarrow_listarray(field_data.imag)
-        elif _is_compound_dtype(field_dtype):
-            mock_dset = _MockDataset(field_dtype)
-            nested_result = _convert_compound_to_separate_columns(field_path, field_data, mock_dset)
-            result.update(nested_result)
-        else:
-            result[field_path] = datasets.features.features.numpy_to_pyarrow_listarray(field_data)

+    def _convert_compound_recursive(compound_arr, compound_dtype):
+        """Recursively convert compound array to nested structure."""
+        nested_data = []
+        for row in compound_arr:
+            row_dict = {}
+            for field_name in compound_dtype.names:
+                field_dtype = compound_dtype[field_name]
+                field_data = row[field_name]
+
+                if _is_complex_dtype(field_dtype):
+                    row_dict[field_name] = {"real": float(field_data.real), "imag": float(field_data.imag)}
+                elif _is_compound_dtype(field_dtype):
+                    row_dict[field_name] = _convert_compound_recursive([field_data], field_dtype)[0]
+                else:
+                    row_dict[field_name] = field_data.item() if field_data.size == 1 else field_data.tolist()
+            nested_data.append(row_dict)
+        return nested_data
+
+    result[base_path] = pa.array(_convert_compound_recursive(arr, dset.dtype))
     return result

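The compound path applies the same idea recursively. A standalone sketch of what _convert_compound_recursive produces for a flat compound dtype (the dtype and values are made up, echoing the test fixtures):

import numpy as np
import pyarrow as pa

dt = np.dtype([("x", "i4"), ("y", "f8")])
arr = np.array([(1, 2.5), (3, 4.5)], dtype=dt)

# One dict per row: field names become struct fields rather than
# "<path>_<field>" columns.
rows = [{name: row[name].item() for name in dt.names} for row in arr]
col = pa.array(rows)
print(col.to_pylist())  # [{'x': 1, 'y': 2.5}, {'x': 3, 'y': 4.5}]
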
tests/packaged_modules/test_hdf5.py

Lines changed: 32 additions & 102 deletions
@@ -1,7 +1,7 @@
+import h5py
 import numpy as np
 import pytest

-import h5py
 from datasets import Array2D, Array3D, Array4D, Features, List, Value
 from datasets.builder import InvalidConfigName
 from datasets.data_files import DataFilesDict, DataFilesList
@@ -257,41 +257,6 @@ def hdf5_file_with_mixed_data_types(tmp_path):
     return str(filename)


-@pytest.fixture
-def hdf5_file_with_complex_collision(tmp_path):
-    """Create an HDF5 file where complex dataset would collide with existing dataset name."""
-    filename = tmp_path / "collision.h5"
-
-    with h5py.File(filename, "w") as f:
-        # Create a complex dataset
-        complex_data = np.array([1 + 2j, 3 + 4j], dtype=np.complex64)
-        f.create_dataset("data", data=complex_data)
-
-        # Create a regular dataset that would collide with the complex real part
-        regular_data = np.array([1.0, 2.0], dtype=np.float32)
-        f.create_dataset("data_real", data=regular_data)  # This should cause a collision
-
-    return str(filename)
-
-
-@pytest.fixture
-def hdf5_file_with_compound_collision(tmp_path):
-    """Create an HDF5 file where compound dataset would collide with existing dataset name."""
-    filename = tmp_path / "compound_collision.h5"
-
-    with h5py.File(filename, "w") as f:
-        # Create a compound dataset
-        dt_compound = np.dtype([("x", "i4"), ("y", "f8")])
-        compound_data = np.array([(1, 2.5), (3, 4.5)], dtype=dt_compound)
-        f.create_dataset("position", data=compound_data)
-
-        # Create a regular dataset that would collide with compound field
-        regular_data = np.array([10, 20], dtype=np.int32)
-        f.create_dataset("position_x", data=regular_data)  # This should cause a collision
-
-    return str(filename)
-
-
 def test_config_raises_when_invalid_name():
     """Test that invalid config names raise an error."""
     with pytest.raises(InvalidConfigName, match="Bad characters"):
@@ -675,23 +640,21 @@ def test_hdf5_complex_numbers(hdf5_file_with_complex_data):
     assert len(tables) == 1
     _, table = tables[0]

-    # Check that complex numbers are split into real/imaginary parts
+    # Check that complex numbers are represented as nested Features
     expected_columns = {
-        "complex_64_real",
-        "complex_64_imag",
-        "complex_128_real",
-        "complex_128_imag",
-        "complex_array_real",
-        "complex_array_imag",
+        "complex_64",
+        "complex_128",
+        "complex_array",
     }
     assert set(table.column_names) == expected_columns

     # Check complex_64 data
-    real_data = table["complex_64_real"].to_pylist()
-    imag_data = table["complex_64_imag"].to_pylist()
-
-    assert real_data == [1.0, 3.0, 5.0, 7.0]
-    assert imag_data == [2.0, 4.0, 6.0, 8.0]
+    complex_64_data = table["complex_64"].to_pylist()
+    assert len(complex_64_data) == 4
+    assert complex_64_data[0] == {"real": 1.0, "imag": 2.0}
+    assert complex_64_data[1] == {"real": 3.0, "imag": 4.0}
+    assert complex_64_data[2] == {"real": 5.0, "imag": 6.0}
+    assert complex_64_data[3] == {"real": 7.0, "imag": 8.0}


 def test_hdf5_compound_types(hdf5_file_with_compound_data):
@@ -706,25 +669,20 @@ def test_hdf5_compound_types(hdf5_file_with_compound_data):
     assert len(tables) == 1
     _, table = tables[0]

-    # Check that compound types are flattened into separate columns
+    # Check that compound types are represented as nested structures
     expected_columns = {
-        "simple_compound_x",
-        "simple_compound_y",
-        "complex_compound_real",
-        "complex_compound_imag",
-        "nested_compound_position_x",
-        "nested_compound_position_y",
-        "nested_compound_velocity_vx",
-        "nested_compound_velocity_vy",
+        "simple_compound",
+        "complex_compound",
+        "nested_compound",
     }
     assert set(table.column_names) == expected_columns

     # Check simple compound data
-    x_data = table["simple_compound_x"].to_pylist()
-    y_data = table["simple_compound_y"].to_pylist()
-
-    assert x_data == [1, 3, 5]
-    assert y_data == [2.5, 4.5, 6.5]
+    simple_compound_data = table["simple_compound"].to_pylist()
+    assert len(simple_compound_data) == 3
+    assert simple_compound_data[0] == {"x": 1, "y": 2.5}
+    assert simple_compound_data[1] == {"x": 3, "y": 4.5}
+    assert simple_compound_data[2] == {"x": 5, "y": 6.5}


 def test_hdf5_feature_inference_complex(hdf5_file_with_complex_data):
@@ -743,10 +701,10 @@ def test_hdf5_feature_inference_complex(hdf5_file_with_complex_data):
     features = hdf5.info.features

     # Check complex number features
-    assert "complex_64_real" in features
-    assert "complex_64_imag" in features
-    assert features["complex_64_real"] == Value("float64")
-    assert features["complex_64_imag"] == Value("float64")
+    assert "complex_64" in features
+    assert isinstance(features["complex_64"], Features)
+    assert features["complex_64"]["real"] == Value("float64")
+    assert features["complex_64"]["imag"] == Value("float64")


 def test_hdf5_feature_inference_compound(hdf5_file_with_compound_data):
@@ -765,10 +723,10 @@ def test_hdf5_feature_inference_compound(hdf5_file_with_compound_data):
     features = hdf5.info.features

     # Check compound type features
-    assert "simple_compound_x" in features
-    assert "simple_compound_y" in features
-    assert features["simple_compound_x"] == Value("int32")
-    assert features["simple_compound_y"] == Value("float64")
+    assert "simple_compound" in features
+    assert isinstance(features["simple_compound"], Features)
+    assert features["simple_compound"]["x"] == Value("int32")
+    assert features["simple_compound"]["y"] == Value("float64")


 def test_hdf5_mixed_data_types(hdf5_file_with_mixed_data_types):
@@ -787,43 +745,15 @@ def test_hdf5_mixed_data_types(hdf5_file_with_mixed_data_types):
     expected_columns = {
         "regular_int",
         "regular_float",
-        "complex_data_real",
-        "complex_data_imag",
-        "compound_data_x",
-        "compound_data_y",
+        "complex_data",
+        "compound_data",
     }
     assert set(table.column_names) == expected_columns

     # Check data types
     assert table["regular_int"].to_pylist() == [0, 1, 2]
-    assert len(table["complex_data_real"].to_pylist()) == 3
-    assert len(table["compound_data_x"].to_pylist()) == 3
-
-
-def test_hdf5_column_name_collision_detection(hdf5_file_with_complex_collision):
-    """Test that column name collision detection works correctly."""
-    config = HDF5Config()
-    hdf5 = HDF5()
-    hdf5.config = config
-    hdf5.config.data_files = DataFilesDict({"train": [hdf5_file_with_complex_collision]})
-
-    # This should raise a ValueError due to column name collision
-    dl_manager = StreamingDownloadManager()
-    with pytest.raises(ValueError, match="Column name collision detected"):
-        hdf5._split_generators(dl_manager)
-
-
-def test_hdf5_compound_collision_detection(hdf5_file_with_compound_collision):
-    """Test collision detection with compound types."""
-    config = HDF5Config()
-    hdf5 = HDF5()
-    hdf5.config = config
-    hdf5.config.data_files = DataFilesDict({"train": [hdf5_file_with_compound_collision]})
-
-    # This should raise a ValueError due to column name collision
-    dl_manager = StreamingDownloadManager()
-    with pytest.raises(ValueError, match="Column name collision detected"):
-        hdf5._split_generators(dl_manager)
+    assert len(table["complex_data"].to_pylist()) == 3
+    assert len(table["compound_data"].to_pylist()) == 3


 def test_hdf5_mismatched_lengths_with_column_filtering(hdf5_file_with_mismatched_lengths):
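
End to end, rows read back should match the dicts asserted in the tests above. A hypothetical usage sketch, assuming the builder is exposed as the packaged "hdf5" loader (the file name example.h5 and dataset name signal are made up):

import h5py
import numpy as np
from datasets import load_dataset

# Write a tiny HDF5 file with one complex dataset.
with h5py.File("example.h5", "w") as f:
    f.create_dataset("signal", data=np.array([1 + 2j, 3 + 4j], dtype=np.complex64))

# Load it through the HDF5 builder; each row should carry a nested struct.
ds = load_dataset("hdf5", data_files="example.h5", split="train")
print(ds[0])  # expected: {'signal': {'real': 1.0, 'imag': 2.0}}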
