Add support for typed numpy arrays (#1953)

AlbertvanHouten · Copilot · web-flow · commit abd3ad26e575 · 2025-11-17T09:39:58.000Z
Add support for typed numpy arrays like below:

```
NDArrayInt = npt.NDArray[np.int_]  # numpy array of integers

class DetectionSample(Sample):
    bboxes: NDArrayInt = bbox_field(dtype=pl.Int32)
```

Resolves #1949

### Checklist

- [ ] I have added tests to cover my changes or documented any manual tests.
- [ ] I have updated the [documentation](https://github.com/open-edge-platform/datumaro/tree/develop/docs) accordingly

---------

Signed-off-by: Albert van Houten <albert.van.houten@intel.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
diff --git a/src/datumaro/experimental/type_registry.py b/src/datumaro/experimental/type_registry.py
@@ -12,7 +12,7 @@
 import logging
 import types
 from collections.abc import Callable
-from typing import Any, Union
+from typing import Any, Union, get_args, get_origin
 
 import numpy as np
 import polars as pl
@@ -165,6 +165,44 @@ def to_numpy(value: Any, dtype: Any = None) -> np.ndarray[Any, Any] | None:
     raise TypeError(f"No converter registered for type {value_type}")
 
 
+def _apply_numpy_dtype_from_type_annotation(array: np.ndarray, target_type: type) -> np.ndarray:
+    """Apply dtype conversion to numpy array based on type annotation.
+
+    Args:
+        array: Numpy array to convert
+        target_type: Type annotation containing dtype information (e.g., npt.NDArray[np.float32])
+
+    Returns:
+        Array with the correct dtype applied
+
+    Example:
+        >>> import numpy.typing as npt
+        >>> arr = np.array([1.0, 2.0], dtype=np.float64)
+        >>> NDArrayFloat32 = npt.NDArray[np.float32]
+        >>> result = _apply_numpy_dtype_from_type_annotation(arr, NDArrayFloat32)
+        >>> result.dtype == np.float32
+        True
+    """
+    type_args = get_args(target_type)
+    # type_args for np.ndarray are typically (shape, dtype)
+    if len(type_args) >= 2:
+        # Extract the dtype from numpy.dtype[T]
+        dtype_generic = type_args[1]
+        # Check if this is a numpy.dtype generic type
+        if get_origin(dtype_generic) is np.dtype:
+            dtype_args = get_args(dtype_generic)
+            if dtype_args:
+                try:
+                    target_dtype = dtype_args[0]
+                    # Only convert if the dtype is different
+                    if array.dtype != np.dtype(target_dtype):
+                        return array.astype(target_dtype)
+                except (AttributeError, TypeError, ValueError):
+                    # If we can't extract or apply dtype, just return the array as-is
+                    pass
+    return array
+
+
 def from_polars_data(polars_data: Any, target_type: type) -> Any:
     """Convert polars data to target type.
 
@@ -189,6 +227,18 @@ def from_polars_data(polars_data: Any, target_type: type) -> Any:
     if target_type in _from_polars_converters:
         return _from_polars_converters[target_type](polars_data)
 
+    # Check if target_type is a generic type (e.g., np.ndarray[Any, np.dtype[np.float32]])
+    origin_type = get_origin(target_type)
+    if origin_type is not None and origin_type in _from_polars_converters:
+        # Handle typed numpy arrays and other generic types
+        result = _from_polars_converters[origin_type](polars_data)
+
+        # For typed numpy arrays, apply the dtype if specified in the type annotation
+        if origin_type is np.ndarray and result is not None:
+            result = _apply_numpy_dtype_from_type_annotation(result, target_type)
+
+        return result
+
     # Handle Union types (e.g., torch.Tensor | np.ndarray)
     # Check if target_type is a Union type (Python 3.10+ style or typing.Union)
     is_union = False
@@ -198,38 +248,33 @@ def from_polars_data(polars_data: Any, target_type: type) -> Any:
     if isinstance(target_type, types.UnionType):
         is_union = True
         union_args = target_type.__args__
-    else:
-        # Check for typing.Union (older syntax: Union[A, B])
-        try:
-            from typing import get_args, get_origin
 
-            if get_origin(target_type) is Union:
-                is_union = True
-                union_args = get_args(target_type)
-        except Exception as e:
-            logger.error(f"Error handling Union type: {e}")
+    # Check for typing.Union (older syntax: Union[A, B])
+    if get_origin(target_type) is Union:
+        is_union = True
+        union_args = get_args(target_type)
 
     if is_union and union_args:
         return _convert_union_types(union_args=union_args, polars_data=polars_data, target_type=target_type)
     raise TypeError(f"No converter registered for type {target_type}")
 
 
 def _convert_union_types(union_args: tuple[type], polars_data: Any, target_type: type) -> Any:
-    if types.NoneType in union_args:
-        # Handle optional types in union (e.g. A | None) when Polars data is None
-        if polars_data is None:
-            return None
-
-        union_args = tuple(arg for arg in union_args if arg is not types.NoneType)
-
-    # For non-optional Union types, try each type in the union until one succeeds
-    for union_type in union_args:
-        if union_type in _from_polars_converters:
-            try:
-                return _from_polars_converters[union_type](polars_data)
-            except KeyError:
-                # If conversion fails, try the next type in the union
-                continue
+    if types.NoneType in union_args and polars_data is None:
+        return None
+
+    non_none_args = tuple(arg for arg in union_args if arg is not types.NoneType)
+
+    # Try each type in the union until one succeeds
+    for union_type in non_none_args:
+        # Delegate to from_polars_data so generic union members are handled too
+        try:
+            return from_polars_data(polars_data, union_type)
+        except (KeyError, TypeError):
+            # If conversion fails, try the next type in the union
+            continue
+
+    # If all conversions failed, raise TypeError
     raise TypeError(f"No converter registered for type {target_type}")
 
 
diff --git a/tests/unit/experimental/test_type_registry.py b/tests/unit/experimental/test_type_registry.py
@@ -325,3 +325,187 @@ def test_points_converter_functionality():
     result = to_numpy(points_obj)
     assert isinstance(result, np.ndarray)
     np.testing.assert_array_equal(result, np.array([[10.0, 20.0, 2.0], [30.0, 40.0, 1.0]]))
+
+
+def test_typed_numpy_array_basic():
+    """Test basic typed numpy array conversion from Polars data."""
+    import numpy.typing as npt
+    import polars as pl
+
+    # Test Float32 typed array
+    NDArrayFloat32 = npt.NDArray[np.float32]
+    df = pl.DataFrame({"data": [[0.8, 0.9]]}, schema={"data": pl.List(pl.Float32)})
+    result = from_polars_data(df["data"][0], NDArrayFloat32)
+
+    assert isinstance(result, np.ndarray)
+    assert result.dtype == np.float32
+    np.testing.assert_array_almost_equal(result, np.array([0.8, 0.9], dtype=np.float32))
+
+    # Test Int32 typed array
+    NDArrayInt32 = npt.NDArray[np.int32]
+    df = pl.DataFrame({"data": [[1, 2, 3]]}, schema={"data": pl.List(pl.Int32)})
+    result = from_polars_data(df["data"][0], NDArrayInt32)
+
+    assert isinstance(result, np.ndarray)
+    assert result.dtype == np.int32
+    np.testing.assert_array_equal(result, np.array([1, 2, 3], dtype=np.int32))
+
+    # Test Float64 typed array
+    NDArrayFloat64 = npt.NDArray[np.float64]
+    df = pl.DataFrame({"data": [[1.5, 2.5]]}, schema={"data": pl.List(pl.Float64)})
+    result = from_polars_data(df["data"][0], NDArrayFloat64)
+
+    assert isinstance(result, np.ndarray)
+    assert result.dtype == np.float64
+    np.testing.assert_array_almost_equal(result, np.array([1.5, 2.5], dtype=np.float64))
+
+
+def test_typed_numpy_array_dtype_conversion():
+    """Test that typed numpy arrays trigger dtype conversion when needed."""
+    import numpy.typing as npt
+    import polars as pl
+
+    # Test conversion from float64 to float32
+    NDArrayFloat32 = npt.NDArray[np.float32]
+    df = pl.DataFrame({"data": [[1.0, 2.0]]}, schema={"data": pl.List(pl.Float64)})
+    result = from_polars_data(df["data"][0], NDArrayFloat32)
+
+    assert result.dtype == np.float32, f"Expected float32 but got {result.dtype}"
+    np.testing.assert_array_almost_equal(result, np.array([1.0, 2.0], dtype=np.float32))
+
+    # Test conversion from int64 to int32
+    NDArrayInt32 = npt.NDArray[np.int32]
+    df = pl.DataFrame({"data": [[10, 20]]}, schema={"data": pl.List(pl.Int64)})
+    result = from_polars_data(df["data"][0], NDArrayInt32)
+
+    assert result.dtype == np.int32, f"Expected int32 but got {result.dtype}"
+    np.testing.assert_array_equal(result, np.array([10, 20], dtype=np.int32))
+
+
+def test_typed_numpy_array_optional():
+    """Test optional typed numpy arrays (Type | None)."""
+    import numpy.typing as npt
+    import polars as pl
+
+    NDArrayFloat32 = npt.NDArray[np.float32]
+    OptionalFloat32 = NDArrayFloat32 | None if sys.version_info >= (3, 10) else Optional[NDArrayFloat32]
+
+    # Test with None
+    result = from_polars_data(None, OptionalFloat32)
+    assert result is None
+
+    # Test with actual data
+    df = pl.DataFrame({"data": [[0.8, 0.9]]}, schema={"data": pl.List(pl.Float32)})
+    result = from_polars_data(df["data"][0], OptionalFloat32)
+
+    assert isinstance(result, np.ndarray)
+    assert result.dtype == np.float32
+    np.testing.assert_array_almost_equal(result, np.array([0.8, 0.9], dtype=np.float32))
+
+
+def test_typed_numpy_array_preserves_dtype():
+    """Test that typed numpy arrays preserve dtype from Polars when types match."""
+    import numpy.typing as npt
+    import polars as pl
+
+    # When Polars dtype matches the type annotation, no conversion should occur
+    NDArrayFloat32 = npt.NDArray[np.float32]
+    df = pl.DataFrame({"data": [[0.5, 0.7]]}, schema={"data": pl.List(pl.Float32)})
+    result = from_polars_data(df["data"][0], NDArrayFloat32)
+
+    assert result.dtype == np.float32
+    # Values should be exact (no float precision loss)
+    np.testing.assert_array_equal(result, np.array([0.5, 0.7], dtype=np.float32))
+
+
+def test_typed_numpy_array_various_dtypes():
+    """Test typed numpy arrays with various numpy dtypes."""
+    import numpy.typing as npt
+    import polars as pl
+
+    # Test uint8
+    NDArrayUInt8 = npt.NDArray[np.uint8]
+    df = pl.DataFrame({"data": [[1, 2, 3]]}, schema={"data": pl.List(pl.UInt8)})
+    result = from_polars_data(df["data"][0], NDArrayUInt8)
+    assert result.dtype == np.uint8
+
+    # Test int64
+    NDArrayInt64 = npt.NDArray[np.int64]
+    df = pl.DataFrame({"data": [[100, 200]]}, schema={"data": pl.List(pl.Int64)})
+    result = from_polars_data(df["data"][0], NDArrayInt64)
+    assert result.dtype == np.int64
+
+    # Test uint16
+    NDArrayUInt16 = npt.NDArray[np.uint16]
+    df = pl.DataFrame({"data": [[1000, 2000]]}, schema={"data": pl.List(pl.UInt16)})
+    result = from_polars_data(df["data"][0], NDArrayUInt16)
+    assert result.dtype == np.uint16
+
+
+def test_typed_numpy_array_helper_function():
+    """Test the _apply_numpy_dtype_from_type_annotation helper function directly."""
+    import numpy.typing as npt
+
+    from datumaro.experimental.type_registry import _apply_numpy_dtype_from_type_annotation
+
+    # Test dtype conversion
+    NDArrayFloat32 = npt.NDArray[np.float32]
+    arr = np.array([1.0, 2.0], dtype=np.float64)
+    result = _apply_numpy_dtype_from_type_annotation(arr, NDArrayFloat32)
+    assert result.dtype == np.float32
+
+    # Test no conversion when dtype already matches
+    arr_f32 = np.array([1.0, 2.0], dtype=np.float32)
+    result = _apply_numpy_dtype_from_type_annotation(arr_f32, NDArrayFloat32)
+    assert result.dtype == np.float32
+
+    # Test with generic np.ndarray (should not convert)
+    arr_f64 = np.array([1.0, 2.0], dtype=np.float64)
+    result = _apply_numpy_dtype_from_type_annotation(arr_f64, np.ndarray)
+    assert result.dtype == np.float64  # Should remain unchanged
+
+
+def test_typed_numpy_array_round_trip():
+    """Test round-trip conversion: numpy -> polars -> typed numpy."""
+    import numpy.typing as npt
+    import polars as pl
+
+    NDArrayFloat32 = npt.NDArray[np.float32]
+
+    # Original typed array
+    original = np.array([0.8, 0.95, 0.87], dtype=np.float32)
+
+    # Convert to polars-compatible format
+    from datumaro.experimental.type_registry import to_numpy
+
+    polars_ready = to_numpy(original, pl.Float32)
+
+    # Create polars series
+    series = pl.Series("scores", [polars_ready], dtype=pl.List(pl.Float32))
+
+    # Extract back from polars
+    polars_data = series[0]
+
+    # Convert back to typed numpy array
+    result = from_polars_data(polars_data, NDArrayFloat32)
+
+    # Verify dtype and values are preserved
+    assert result.dtype == np.float32
+    np.testing.assert_array_almost_equal(original, result)
+
+
+def test_typed_numpy_array_multidimensional():
+    """Test typed numpy arrays with multidimensional data."""
+    import numpy.typing as npt
+    import polars as pl
+
+    NDArrayInt32 = npt.NDArray[np.int32]
+
+    # Uses a flat pl.List(pl.Int32) column, so the values arrive as a flattened
+    # 1D sequence (not nested lists); the result is expected to have shape (4,).
+    df = pl.DataFrame({"data": [[10, 15, 30, 35]]}, schema={"data": pl.List(pl.Int32)})
+    result = from_polars_data(df["data"][0], NDArrayInt32)
+
+    assert result.dtype == np.int32
+    assert result.shape == (4,)
+    np.testing.assert_array_equal(result, np.array([10, 15, 30, 35], dtype=np.int32))