
Commit 3f3287b

Sample and dtype validation (#1966)
This pull request introduces improvements to type safety, validation, and schema handling for experimental dataset fields, with updates across core modules and tests. The main changes are stricter type validation for sample attributes, standardized handling of `dtype` for fields, and test updates that reflect these improvements.

**Validation and Type Safety Enhancements**

* Added a `validate()` method to the `Sample` class that checks attribute presence and type correctness against the inferred schema, including support for `Union` and `Callable` types. Validation is now called after initialization.
* Improved the attribute type validation logic to correctly handle complex types such as `Union` and `Callable`.

**Field `dtype` Standardization**

* Updated the `Field` base class to require that `dtype` is always a Polars `DataType` instance, raising a `TypeError` when a `DataType` class (rather than an instance) or any other invalid value is supplied (see the short Polars sketch after this description).
* All field subclasses now explicitly declare their `dtype` using `field(default_factory=...)` or `field(default=..., init=False)`, ensuring consistent and correct schema generation.

**Schema and Serialization Improvements**

* The `from_dict` method for fields now skips non-init dataclass fields during deserialization, preventing errors from fields that shouldn't be set via the constructor.

**Test Suite Updates**

* Updated integration and unit tests to use explicit types for callable/image/mask fields, replacing `Any` with more precise unions (e.g., `np.ndarray | Callable[[], np.ndarray]`). This strengthens type checking and reflects the stricter validation logic.
* Added a new unit test to verify correct validation and conversion of field `dtype`, including error handling for invalid types.

**General Codebase Maintenance**

* Added missing imports and minor refactoring for clarity and correctness in test and core files.

These changes collectively improve the robustness and maintainability of the experimental dataset system, especially around schema definition, attribute validation, and field type safety.

Resolves #1855

### Checklist

- [x] I have added tests to cover my changes or documented any manual tests.
- [ ] I have updated the [documentation](https://github.com/open-edge-platform/datumaro/tree/develop/docs) accordingly

---------

Signed-off-by: Jort Bergfeld <[email protected]>
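As an illustration of the `dtype` rule described above (plain Polars behavior, not Datumaro API): `pl.Float32` is a dtype class, while `pl.Float32()` is a `DataType` instance, and only the latter satisfies the new check.

```python
import polars as pl

# pl.Float32 is the DataType class; calling it produces a DataType instance.
print(isinstance(pl.Float32, pl.DataType))    # False -- a class, not an instance
print(isinstance(pl.Float32(), pl.DataType))  # True  -- what Field.dtype now requires
print(pl.Float32() == pl.Float32())           # True  -- equal instances compare equal
```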
1 parent 366941e commit 3f3287b

27 files changed: +483 -323 lines changed

src/datumaro/experimental/converters/image_converters.py

Lines changed: 4 additions & 4 deletions
@@ -106,7 +106,7 @@ def filter_output_spec(self) -> bool:
     name=self.output_image.name,
     field=ImageField(
         semantic=self.input_image.field.semantic,
-        dtype=pl.Float32,
+        dtype=pl.Float32(),
         format=self.input_image.field.format,
         channels_first=self.output_image.field.channels_first,
     ),
@@ -161,7 +161,7 @@ def filter_output_spec(self) -> bool:
     name=self.output_image.name,
     field=ImageField(
         semantic=self.input_path.field.semantic,
-        dtype=pl.UInt8,  # Default to UInt8 for loaded images
+        dtype=pl.UInt8(),  # Default to UInt8 for loaded images
         format="RGB",  # Default to RGB format
         channels_first=self.output_image.field.channels_first,
     ),
@@ -275,7 +275,7 @@ def filter_output_spec(self) -> bool:
     name=self.output_image.name,
     field=ImageField(
         semantic=self.input_bytes.field.semantic,
-        dtype=pl.UInt8,  # Default to UInt8 for decoded images
+        dtype=pl.UInt8(),  # Default to UInt8 for decoded images
         format="RGB",  # Default to RGB format
         channels_first=self.output_image.field.channels_first,
     ),
@@ -342,7 +342,7 @@ def filter_output_spec(self) -> bool:
     name=self.output_image.name,
     field=ImageField(
         semantic=self.input_callable.field.semantic,
-        dtype=pl.UInt8,  # Default to UInt8 for image data
+        dtype=pl.UInt8(),  # Default to UInt8 for image data
         format=self.input_callable.field.format,  # Use format from callable field
         channels_first=self.output_image.field.channels_first,
     ),

src/datumaro/experimental/converters/mask_converters.py

Lines changed: 4 additions & 4 deletions
@@ -153,7 +153,7 @@ def apply_conversion_batch(batch_df: pl.DataFrame) -> pl.DataFrame:

     return pl.struct(
         pl.Series(results_batch_polygons).alias("mask"),
-        pl.Series(results_batch_shape, dtype=pl.List(pl.Int32)).alias("shape"),
+        pl.Series(results_batch_shape, dtype=pl.List(pl.Int32())).alias("shape"),
         eager=True,
     )

@@ -165,7 +165,7 @@ def apply_conversion_batch(batch_df: pl.DataFrame) -> pl.DataFrame:
         ]
     ).map_batches(
         apply_conversion_batch,
-        return_dtype=pl.Struct({"mask": pl.List(pl.UInt8), "shape": pl.List(pl.Int32)}),
+        return_dtype=pl.Struct({"mask": pl.List(pl.UInt8()), "shape": pl.List(pl.Int32())}),
     )

     return df.with_columns(
@@ -282,7 +282,7 @@ def apply_conversion_batch(batch_df: pl.DataFrame, **kwargs) -> pl.DataFrame: #

     return pl.struct(
         pl.Series(results_batch_mask).alias("mask"),
-        pl.Series(results_batch_shape, dtype=pl.List(pl.Int32)).alias("shape"),
+        pl.Series(results_batch_shape, dtype=pl.List(pl.Int32())).alias("shape"),
         eager=True,
     )

@@ -294,7 +294,7 @@ def apply_conversion_batch(batch_df: pl.DataFrame, **kwargs) -> pl.DataFrame: #
     ).map_batches(
         apply_conversion_batch,
         return_dtype=pl.Struct(
-            {"mask": pl.List(self.output_instance_mask.field.dtype), "shape": pl.List(pl.Int32)}
+            {"mask": pl.List(self.output_instance_mask.field.dtype), "shape": pl.List(pl.Int32())}
         ),
     )
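
For context, a minimal Polars-only sketch of the nested dtypes used above; the names are illustrative.

```python
import polars as pl

# Nested dtypes are composed from DataType instances, mirroring the pattern above.
shape_dtype = pl.List(pl.Int32())
result_dtype = pl.Struct({"mask": pl.List(pl.UInt8()), "shape": shape_dtype})

shapes = pl.Series("shape", [[128, 128], [64, 64]], dtype=shape_dtype)
print(shapes.dtype)   # List(Int32)
print(result_dtype)   # Struct with 'mask' and 'shape' entries
```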

src/datumaro/experimental/dataset.py

Lines changed: 43 additions & 1 deletion
@@ -4,7 +4,9 @@

 from __future__ import annotations

+import collections.abc
 import types
+import typing
 from functools import cache
 from typing import TYPE_CHECKING, Annotated, Any, Generic, TypeGuard, Union, cast, get_args, get_origin, get_type_hints

@@ -35,8 +37,8 @@ def __init__(self, **kwargs: Any):
         """Initialize sample with provided attributes."""
         for key, value in kwargs.items():
             setattr(self, key, value)
-
         self.__post_init__()
+        self.validate()

     def __post_init__(self) -> None:
         pass
@@ -46,6 +48,46 @@ def __repr__(self):
         fields = ", ".join(f"{key}={getattr(self, key)}" for key in self.__dict__ if not key.startswith("_"))
         return f"{self.__class__.__name__}({fields})"

+    def validate(self) -> None:
+        """
+        Validate the sample's attributes against the inferred schema.
+
+        Raises:
+            ValueError: If required attributes are missing
+            TypeError: If attribute types do not match the schema
+        """
+        schema = self.__class__.infer_schema()  # Cached per class
+        for name, attr_info in schema.attributes.items():
+            if name not in self.__dict__:
+                continue
+            value = getattr(self, name)
+            expected_type = attr_info.type
+            field = attr_info.field
+
+            if not self._validate_attribute_type(expected_type, value):
+                raise TypeError(f"Attribute `{name}` must be of type `{expected_type}`.")
+
+            # Custom field validation (if any)
+            if hasattr(field, "validate"):
+                field.validate(value)
+
+    def _validate_attribute_type(self, expected_type: Any, value: Any) -> bool:
+        """
+        Recursively validate attribute type, handling Union and Callable types.
+        """
+        # Union and Callable types have to be handled separately,
+        # because isinstance() does not work with Callable types.
+        origin = get_origin(expected_type)
+        if origin is Union:
+            # Check each type in the Union
+            return any(self._validate_attribute_type(typ, value) for typ in get_args(expected_type))
+        if origin in {typing.Callable, collections.abc.Callable} or expected_type in {
+            typing.Callable,
+            collections.abc.Callable,
+        }:
+            return callable(value)
+        return isinstance(value, expected_type)
+
     @classmethod
     @cache
     def infer_schema(cls) -> Schema:
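
For reference, a standalone sketch (not the Datumaro API) of why `Union` and `Callable` need special handling in `_validate_attribute_type`: `isinstance()` rejects both as its second argument, so they are unwrapped first.

```python
import collections.abc
import typing
from typing import Callable, Union, get_args, get_origin

import numpy as np


def check(expected_type, value) -> bool:
    """Standalone re-statement of the Union/Callable-aware check added above."""
    origin = get_origin(expected_type)
    if origin is Union:
        return any(check(t, value) for t in get_args(expected_type))
    if origin in {typing.Callable, collections.abc.Callable} or expected_type in {
        typing.Callable,
        collections.abc.Callable,
    }:
        return callable(value)
    return isinstance(value, expected_type)


image_type = Union[np.ndarray, Callable[[], np.ndarray]]
print(check(image_type, np.zeros((2, 2))))     # True: matches np.ndarray
print(check(image_type, lambda: np.zeros(2)))  # True: matches Callable
print(check(image_type, "not an image"))       # False
```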

src/datumaro/experimental/export_import.py

Lines changed: 3 additions & 3 deletions
@@ -393,10 +393,10 @@ def _update_dataframe_with_field(
     if is_path_field:
         if field_name in df.columns:
             df = df.drop(field_name)
-        return df.with_columns(pl.Series(field_name, values, dtype=pl.String))
+        return df.with_columns(pl.Series(field_name, values, dtype=pl.String()))
     if field_name in df.columns:
         return df.with_columns(pl.Series(field_name, values))
-    return df.with_columns(pl.Series(field_name, values, dtype=pl.Object))
+    return df.with_columns(pl.Series(field_name, values, dtype=pl.Object()))


 def _reconstruct_image_fields(
@@ -430,7 +430,7 @@ def _add_missing_object_columns(
     """Add back any object columns that weren't reconstructed from images."""
     for col_name in object_columns:
         if col_name not in df.columns:
-            df = df.with_columns(pl.Series(col_name, [None] * len(df), dtype=pl.Object))
+            df = df.with_columns(pl.Series(col_name, [None] * len(df), dtype=pl.Object()))
     return df
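
For context, a minimal sketch of constructing Series with explicit dtype instances, as the helpers above now do; the column names are illustrative.

```python
import polars as pl

paths = pl.Series("image_path", ["a.jpg", "b.jpg"], dtype=pl.String())
objects = pl.Series("payload", [{"any": "python object"}, None], dtype=pl.Object())

df = pl.DataFrame([paths, objects])
print(df.schema)  # image_path: String, payload: Object
```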

src/datumaro/experimental/fields/annotations.py

Lines changed: 8 additions & 8 deletions
@@ -6,7 +6,7 @@

 import polars as pl

-from datumaro.experimental.fields.base import Field, PolarsDataType, Semantic, T, convert_numpy_object_array_to_series
+from datumaro.experimental.fields.base import Field, Semantic, T, convert_numpy_object_array_to_series
 from datumaro.experimental.type_registry import from_polars_data, to_numpy

@@ -26,7 +26,7 @@ class BBoxField(Field):
     """

     semantic: Semantic
-    dtype: PolarsDataType = field(default_factory=pl.Float32)
+    dtype: pl.DataType = field(default_factory=pl.Float32)
     format: str = "x1y1x2y2"
     normalize: bool = False

@@ -95,7 +95,7 @@ class RotatedBBoxField(Field):
     """

     semantic: Semantic
-    dtype: PolarsDataType = field(default_factory=pl.Float32)
+    dtype: pl.DataType = field(default_factory=pl.Float32)
     format: str = "cxcywhr"
     normalize: bool = False

@@ -159,7 +159,7 @@ class LabelField(Field):
     """

     semantic: Semantic
-    dtype: PolarsDataType = field(default_factory=pl.UInt8)
+    dtype: pl.DataType = field(default_factory=pl.UInt8)
     multi_label: bool = False  # Flag to indicate if this field should handle multi-labels
     is_list: bool = False

@@ -217,7 +217,7 @@ class ScoreField(Field):
     """

     semantic: Semantic
-    dtype: PolarsDataType = field(default_factory=pl.Float32)
+    dtype: pl.DataType = field(default_factory=pl.Float32)
     is_list: bool = False

     @property
@@ -276,7 +276,7 @@ class PolygonField(Field):
     """

     semantic: Semantic
-    dtype: PolarsDataType = field(default_factory=pl.Float32)
+    dtype: pl.DataType = field(default_factory=pl.Float32)
     format: str = "xy"
     normalize: bool = False

@@ -335,7 +335,7 @@ class KeypointsField(Field):
     """

     semantic: Semantic
-    dtype: PolarsDataType = field(default_factory=pl.Float32)
+    dtype: pl.DataType = field(default_factory=pl.Float32)
     normalize: bool = False

     def to_polars_schema(self, name: str) -> dict[str, pl.DataType]:
@@ -395,7 +395,7 @@ class EllipseField(Field):
     """

     semantic: Semantic
-    dtype: PolarsDataType = field(default_factory=pl.Float32)
+    dtype: pl.DataType = field(default_factory=pl.Float32)
     format: str = "x1y1x2y2"
     normalize: bool = False
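
As a side note, `field(default_factory=pl.Float32)` calls the class each time a field object is created, so the resulting `dtype` is already a `pl.Float32()` instance; a minimal standalone sketch with a hypothetical `Example` dataclass:

```python
from dataclasses import dataclass, field

import polars as pl


@dataclass
class Example:  # hypothetical class, only to show the default_factory pattern
    dtype: pl.DataType = field(default_factory=pl.Float32)


print(Example().dtype)                           # Float32
print(isinstance(Example().dtype, pl.DataType))  # True
```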

src/datumaro/experimental/fields/base.py

Lines changed: 15 additions & 3 deletions
@@ -12,15 +12,13 @@
 from dataclasses import fields as dataclass_fields
 from dataclasses import is_dataclass
 from enum import Flag, auto
-from typing import Any, TypeAlias, TypeVar
+from typing import Any, TypeVar

 import numpy as np
 import polars as pl

 T = TypeVar("T")

-PolarsDataType: TypeAlias = type[pl.DataType] | pl.DataType
-

 class Semantic(Flag):
     """
@@ -48,6 +46,17 @@ class Field:
     """

     semantic: Semantic
+    dtype: pl.DataType
+
+    def __post_init__(self):
+        dtype = getattr(self, "dtype")
+        if isinstance(dtype, type) and issubclass(dtype, pl.DataType):
+            raise TypeError(
+                f"dtype must be a Polars 'DataType' (instance), not a Polars 'DataTypeClass' (type). "
+                f"Make sure your dtype declaration uses parentheses ({dtype.__name__}() instead of {dtype.__name__})"
+            )
+        if not isinstance(dtype, pl.DataType):
+            raise TypeError(f"dtype must be a Polars 'DataType', got '{dtype.__name__}' instead.")

     def to_polars_schema(self, name: str) -> dict[str, pl.DataType]:
         """
@@ -165,6 +174,9 @@ def from_dict(cls, field_dict: dict[str, Any]) -> "Field":
         # Use dataclass introspection to get all expected fields
         if is_dataclass(field_class):
             for dc_field in dataclass_fields(field_class):
+                if not dc_field.init:
+                    continue  # Skip fields that are not in __init__
+
                 field_name = dc_field.name

                 # Skip if not in the serialized data
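
To make the new contract concrete, a standalone sketch with a hypothetical `StrictDtypeField` dataclass that mirrors (rather than imports) the `__post_init__` check above:

```python
from dataclasses import dataclass

import polars as pl


@dataclass
class StrictDtypeField:  # hypothetical stand-in mirroring Field.__post_init__
    dtype: pl.DataType

    def __post_init__(self):
        if isinstance(self.dtype, type) and issubclass(self.dtype, pl.DataType):
            raise TypeError("dtype must be a DataType instance; use parentheses, e.g. pl.Float32()")
        if not isinstance(self.dtype, pl.DataType):
            raise TypeError("dtype must be a Polars DataType")


StrictDtypeField(dtype=pl.Float32())    # accepted: DataType instance
try:
    StrictDtypeField(dtype=pl.Float32)  # rejected: DataType class
except TypeError as err:
    print(err)
```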

src/datumaro/experimental/fields/datasets.py

Lines changed: 15 additions & 2 deletions
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: MIT
 import types
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from enum import Enum, auto
 from typing import Any, Union, get_args, get_origin

@@ -44,6 +44,18 @@ class TileField(Field):
     """

     semantic: Semantic
+    dtype: pl.DataType = field(
+        default_factory=lambda: pl.Struct(
+            [
+                pl.Field("source_sample_idx", pl.Int32()),
+                pl.Field("x", pl.Int32()),
+                pl.Field("y", pl.Int32()),
+                pl.Field("width", pl.Int32()),
+                pl.Field("height", pl.Int32()),
+            ]
+        ),
+        init=False,
+    )

     def to_polars_schema(self, name: str) -> dict[str, pl.DataType]:
         """Generate Polars schema for tile information."""
@@ -123,6 +135,7 @@ class SubsetField(Field):

     semantic: Semantic
     categories: list[str] | None = None
+    dtype: pl.DataType = field(default_factory=pl.Categorical, init=False)

     def to_polars_schema(self, name: str) -> dict[str, pl.DataType]:
         """Generate schema with categorical type for subset values."""
@@ -141,7 +154,7 @@ def to_polars(self, name: str, value: Any) -> dict[str, pl.Series]:
         polars_value = str(value)

         # Create categorical series with predefined categories if available
-        return {name: pl.Series(name, [polars_value], dtype=pl.Categorical)}
+        return {name: pl.Series(name, [polars_value], dtype=pl.Categorical())}

     def from_polars(self, name: str, row_index: int, df: pl.DataFrame, target_type: type[T]) -> T:
         """Reconstruct subset value from Polars data.

src/datumaro/experimental/fields/images.py

Lines changed: 9 additions & 3 deletions
@@ -7,7 +7,7 @@
 import numpy as np
 import polars as pl

-from datumaro.experimental.fields.base import Field, PolarsDataType, Semantic, T
+from datumaro.experimental.fields.base import Field, Semantic, T
 from datumaro.experimental.type_registry import from_polars_data, to_numpy

@@ -26,7 +26,7 @@ class TensorField(Field):
     """

     semantic: Semantic
-    dtype: PolarsDataType = field(default_factory=pl.UInt8)
+    dtype: pl.DataType = field(default_factory=pl.UInt8)
     channels_first: bool = False

     def to_polars_schema(self, name: str) -> dict[str, pl.DataType]:
@@ -129,6 +129,7 @@ class ImageBytesField(Field):
     """

     semantic: Semantic
+    dtype: pl.DataType = field(default_factory=pl.Binary, init=False)

     def to_polars_schema(self, name: str) -> dict[str, pl.DataType]:
         """Generate schema for image bytes as binary data."""
@@ -174,6 +175,9 @@ class ImageInfoField(Field):
     """

     semantic: Semantic
+    dtype: pl.DataType = field(
+        default_factory=lambda: pl.Struct([pl.Field("width", pl.Int32()), pl.Field("height", pl.Int32())]), init=False
+    )

     def to_polars_schema(self, name: str) -> dict[str, pl.DataType]:
         return {
@@ -228,6 +232,7 @@ class ImagePathField(Field):
     """

     semantic: Semantic
+    dtype: pl.DataType = field(default_factory=pl.String, init=False)

     def to_polars_schema(self, name: str) -> dict[str, pl.DataType]:
         """Generate schema for string path column."""
@@ -272,10 +277,11 @@ class ImageCallableField(Field):

     semantic: Semantic
     format: str = "RGB"
+    dtype: pl.DataType = field(default_factory=pl.Object, init=False)

     def to_polars_schema(self, name: str) -> dict[str, pl.DataType]:
         """Return schema with Object type to store callable."""
-        return {name: pl.Object}
+        return {name: pl.Object()}

     def to_polars(self, name: str, value: callable) -> dict[str, pl.Series]:
         """Store callable as Object in Polars series."""

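For context, a minimal Polars-only sketch of the Object-column pattern `ImageCallableField` relies on; the loader function here is hypothetical.

```python
import numpy as np
import polars as pl


def load_image() -> np.ndarray:  # hypothetical lazy loader
    return np.zeros((4, 4, 3), dtype=np.uint8)


# Object columns can hold arbitrary Python objects, including callables.
loaders = pl.Series("image_loader", [load_image], dtype=pl.Object())
print(callable(loaders[0]), loaders[0]().shape)  # True (4, 4, 3)
```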