2
2
from dataclasses import dataclass
3
3
from typing import Dict , List , Optional
4
4
5
- import h5py
6
5
import numpy as np
7
6
import pyarrow as pa
8
7
9
8
import datasets
10
- from datasets .features .features import LargeList , Sequence , _ArrayXD
9
+ import h5py
10
+ from datasets .features .features import (
11
+ Array2D ,
12
+ Array3D ,
13
+ Array4D ,
14
+ Array5D ,
15
+ LargeList ,
16
+ Sequence ,
17
+ Value ,
18
+ _ArrayXD ,
19
+ _arrow_to_datasets_dtype ,
20
+ )
11
21
from datasets .table import table_cast
12
22
13
23
@@ -76,7 +86,7 @@ def _split_generators(self, dl_manager):
76
86
77
87
def _cast_table(self, pa_table: pa.Table) -> pa.Table:
    """Cast *pa_table* to the configured feature schema when it is safe to do so.

    Casting is skipped when no features are configured, or when any feature
    contains a zero-length dimension (which ``table_cast`` cannot handle).
    """
    features = self.info.features
    if features is None:
        return pa_table
    if any(_has_zero_dimensions(feature) for feature in features.values()):
        return pa_table
    return table_cast(pa_table, features.arrow_schema)
@@ -105,7 +115,13 @@ def _generate_tables(self, files):
105
115
if self .config .columns is not None and path not in self .config .columns :
106
116
continue
107
117
arr = dset [start :end ]
108
- pa_arr = datasets .features .features .numpy_to_pyarrow_listarray (arr )
118
+ if _is_ragged_dataset (dset ):
119
+ if _is_variable_length_string (dset ):
120
+ pa_arr = _variable_length_string_to_pyarrow (arr , dset )
121
+ else :
122
+ pa_arr = _ragged_array_to_pyarrow_largelist (arr , dset )
123
+ else :
124
+ pa_arr = datasets .features .features .numpy_to_pyarrow_listarray (arr ) # NOTE: type=None
109
125
batch_dict [path ] = pa_arr
110
126
pa_table = pa .Table .from_pydict (batch_dict )
111
127
yield f"{ file_idx } _{ start } " , self ._cast_table (pa_table )
@@ -123,82 +139,137 @@ def _traverse_datasets(h5_obj, prefix: str = "") -> Dict[str, h5py.Dataset]:
123
139
return mapping
124
140
125
141
126
- _DTYPE_TO_DATASETS : Dict [np .dtype , str ] = { # FIXME: necessary/check if util exists?
127
- np .dtype ("bool" ).newbyteorder ("=" ): "bool" ,
128
- np .dtype ("int8" ).newbyteorder ("=" ): "int8" ,
129
- np .dtype ("int16" ).newbyteorder ("=" ): "int16" ,
130
- np .dtype ("int32" ).newbyteorder ("=" ): "int32" ,
131
- np .dtype ("int64" ).newbyteorder ("=" ): "int64" ,
132
- np .dtype ("uint8" ).newbyteorder ("=" ): "uint8" ,
133
- np .dtype ("uint16" ).newbyteorder ("=" ): "uint16" ,
134
- np .dtype ("uint32" ).newbyteorder ("=" ): "uint32" ,
135
- np .dtype ("uint64" ).newbyteorder ("=" ): "uint64" ,
136
- np .dtype ("float16" ).newbyteorder ("=" ): "float16" ,
137
- np .dtype ("float32" ).newbyteorder ("=" ): "float32" ,
138
- np .dtype ("float64" ).newbyteorder ("=" ): "float64" ,
139
- # np.dtype("complex64").newbyteorder("="): "complex64",
140
- # np.dtype("complex128").newbyteorder("="): "complex128",
141
- }
142
-
143
-
144
- def _dtype_to_dataset_dtype (dtype : np .dtype ) -> str :
145
- """Map NumPy dtype to datasets.Value dtype string, falls back to "binary" for unknown or unsupported dtypes."""
146
-
147
- # FIXME: endian fix necessary/correct?
148
- base_dtype = dtype .newbyteorder ("=" )
149
- if base_dtype in _DTYPE_TO_DATASETS :
150
- return _DTYPE_TO_DATASETS [base_dtype ]
151
-
152
- if base_dtype .kind in {"S" , "a" }:
153
- return "binary"
154
-
155
- # FIXME: seems h5 converts unicode back to bytes?
156
- if base_dtype .kind == "U" :
157
- return "binary"
158
-
159
- if base_dtype .kind == "O" :
160
- return "binary"
161
-
162
- # FIXME: support varlen?
163
-
164
- return "binary"
142
+ def _base_dtype (dtype ):
143
+ if hasattr (dtype , "metadata" ) and dtype .metadata and "vlen" in dtype .metadata :
144
+ return dtype .metadata ["vlen" ]
145
+ if hasattr (dtype , "subdtype" ) and dtype .subdtype is not None :
146
+ return _base_dtype (dtype .subdtype [0 ])
147
+ return dtype
148
+
149
+
150
def _ragged_array_to_pyarrow_largelist(arr: np.ndarray, dset: h5py.Dataset) -> pa.Array:
    """Convert a batch read from a ragged HDF5 dataset into a pyarrow list array.

    Variable-length string rows are decoded (bytes -> utf-8) and each
    non-null string is wrapped in a single-element pyarrow array; other
    ragged data is handled by the recursive nested converter.
    """
    if not _is_variable_length_string(dset):
        return _convert_nested_ragged_array_recursive(arr, dset.dtype)

    decoded = [
        row.decode("utf-8") if isinstance(row, bytes) else row
        for row in arr
    ]
    chunks = [None if s is None else pa.array([s]) for s in decoded]
    return datasets.features.features.list_of_pa_arrays_to_pyarrow_listarray(chunks)
165
+
166
+
167
def _convert_nested_ragged_array_recursive(arr: np.ndarray, dtype):
    """Recursively convert a ragged nested array into a pyarrow list array.

    Sub-array dtypes recurse one nesting level at a time; at the leaf
    level each row is coerced to a numpy array of *dtype* before being
    turned into a pyarrow array. ``None`` rows become nulls throughout.
    """
    sub = getattr(dtype, "subdtype", None)
    if sub is not None:
        inner_dtype = sub[0]
        converted = [
            None if row is None else _convert_nested_ragged_array_recursive(row, inner_dtype)
            for row in arr
        ]
        # NOTE(review): the non-null entries of `converted` are already
        # pyarrow arrays, yet they are re-wrapped with pa.array() below —
        # confirm pa.array accepts a pa.Array here for the versions in use.
        chunks = [None if item is None else pa.array(item) for item in converted]
        return datasets.features.features.list_of_pa_arrays_to_pyarrow_listarray(chunks)

    rows = []
    for row in arr:
        if row is None:
            rows.append(None)
        elif isinstance(row, np.ndarray):
            rows.append(row)
        else:
            rows.append(np.array(row, dtype=dtype))
    chunks = [None if item is None else pa.array(item) for item in rows]
    return datasets.features.features.list_of_pa_arrays_to_pyarrow_listarray(chunks)
165
192
166
193
167
194
def _infer_feature_from_dataset(dset: h5py.Dataset):
    """Infer a ``datasets`` feature for one HDF5 dataset.

    Dispatch order: variable-length strings -> ``Value("string")``; other
    ragged datasets -> nested ``Sequence`` features; fixed-shape datasets ->
    ``Value`` / ``Sequence`` / ``ArrayXD`` depending on the per-row rank
    (``dset.shape[1:]``, i.e. the shape with the batch axis removed).

    Raises:
        TypeError: for unsupported dtype kinds, or per-row rank > 5.
    """
    if _is_variable_length_string(dset):
        return Value("string")  # FIXME: large_string?

    if _is_ragged_dataset(dset):
        return _infer_nested_feature_recursive(dset.dtype, dset)

    # Validate the dtype kind *before* converting through pyarrow: otherwise
    # pa.from_numpy_dtype raises an opaque pyarrow error (e.g. for complex
    # dtypes) instead of the intended TypeError naming the dataset.
    if dset.dtype.kind not in {"b", "i", "u", "f", "S", "a"}:
        raise TypeError(f"Unsupported dtype {dset.dtype} for dataset {dset.name}")

    value_feature = _np_to_pa_to_hf_value(dset.dtype)
    dtype_str = value_feature.dtype
    value_shape = dset.shape[1:]

    rank = len(value_shape)
    if rank == 0:
        return value_feature
    elif rank == 1:
        return Sequence(value_feature, length=value_shape[0])
    elif 2 <= rank <= 5:
        return _sized_arrayxd(rank)(shape=value_shape, dtype=dtype_str)
    else:
        raise TypeError(f"Array{rank}D not supported. Only up to 5D arrays are supported.")
219
def _infer_nested_feature_recursive(dtype, dset: h5py.Dataset):
    """Build nested ``Sequence`` features mirroring a ragged dtype's nesting.

    Sub-array dtypes add one ``Sequence`` level per nesting step; an object
    ("O") leaf is resolved through the dataset's base dtype (``S1`` for
    variable-length strings); any other leaf maps straight to a ``Value``.
    """
    sub = getattr(dtype, "subdtype", None)
    if sub is not None:
        return Sequence(_infer_nested_feature_recursive(sub[0], dset))

    if getattr(dtype, "kind", None) == "O":
        if _is_variable_length_string(dset):
            leaf_dtype = np.dtype("S1")
        else:
            leaf_dtype = _base_dtype(dset.dtype)
        return Sequence(_np_to_pa_to_hf_value(leaf_dtype))

    return _np_to_pa_to_hf_value(dtype)
196
233
197
234
198
def _has_zero_dimensions(feature):
    """Return True when *feature* (or any nested feature) has a zero-length dim."""
    if isinstance(feature, _ArrayXD):
        return 0 in feature.shape
    if isinstance(feature, (Sequence, LargeList)):
        return feature.length == 0 or _has_zero_dimensions(feature.feature)
    return False
242
+
243
+
244
def _sized_arrayxd(rank: int):
    """Return the fixed-shape ArrayXD feature class for 2 <= rank <= 5."""
    arrayxd_by_rank = {2: Array2D, 3: Array3D, 4: Array4D, 5: Array5D}
    return arrayxd_by_rank[rank]
246
+
247
+
248
def _np_to_pa_to_hf_value(numpy_dtype: np.dtype) -> Value:
    """Map a numpy dtype to a ``datasets.Value`` via pyarrow's type system."""
    arrow_type = pa.from_numpy_dtype(numpy_dtype)
    return Value(dtype=_arrow_to_datasets_dtype(arrow_type))
250
+
251
+
252
def _is_ragged_dataset(dset: h5py.Dataset) -> bool:
    """Return True when *dset* stores variable-length (ragged) data.

    h5py exposes variable-length data through an object ("O") numpy dtype.
    The previous extra ``hasattr(dset.dtype, "subdtype")`` guard was
    vacuous — every numpy dtype object has a ``subdtype`` attribute (it is
    simply ``None`` for scalar dtypes) — so the check reduces to the dtype
    kind alone.
    """
    return dset.dtype.kind == "O"
254
+
255
+
256
def _is_variable_length_string(dset: h5py.Dataset) -> bool:
    """Heuristically detect a variable-length string dataset.

    Samples up to the first three rows and reports True if any of them is a
    ``str`` or ``bytes`` instance.
    """
    if not _is_ragged_dataset(dset) or dset.shape[0] == 0:
        return False
    # NOTE(review): sampling rows is a heuristic; h5py.check_string_dtype
    # on dset.dtype may be a more robust metadata-based check — confirm.
    for idx in range(min(3, dset.shape[0])):
        try:
            sample = dset[idx]
        except (IndexError, TypeError):
            continue
        if isinstance(sample, (str, bytes)):
            return True
    return False
267
+
268
+
269
def _variable_length_string_to_pyarrow(arr: np.ndarray, dset: h5py.Dataset) -> pa.Array:
    """Decode a batch of variable-length strings into a pyarrow string array.

    Bytes entries are decoded as utf-8; other entries pass through as-is.
    (*dset* is accepted for signature symmetry with the other converters
    but is not consulted.)
    """
    decoded = [
        item.decode("utf-8") if isinstance(item, bytes) else item
        for item in arr
    ]
    return pa.array(decoded)
0 commit comments