From 18bf2779b8e7e15e680924ea0ed0824a3818b294 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Wed, 17 Dec 2025 11:35:25 -0300 Subject: [PATCH 1/5] chore: add explicit discriminator field for processors --- src/data_designer/config/data_designer_config.py | 4 ++-- src/data_designer/config/processors.py | 10 +++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/data_designer/config/data_designer_config.py b/src/data_designer/config/data_designer_config.py index 06f02995..d90deb41 100644 --- a/src/data_designer/config/data_designer_config.py +++ b/src/data_designer/config/data_designer_config.py @@ -11,7 +11,7 @@ from data_designer.config.base import ExportableConfigBase from data_designer.config.column_types import ColumnConfigT from data_designer.config.models import ModelConfig -from data_designer.config.processors import ProcessorConfig +from data_designer.config.processors import ProcessorConfigT from data_designer.config.sampler_constraints import ColumnConstraintT from data_designer.config.seed import SeedConfig @@ -37,4 +37,4 @@ class DataDesignerConfig(ExportableConfigBase): seed_config: Optional[SeedConfig] = None constraints: Optional[list[ColumnConstraintT]] = None profilers: Optional[list[ColumnProfilerConfigT]] = None - processors: Optional[list[ProcessorConfig]] = None + processors: Optional[list[Annotated[ProcessorConfigT, Field(discriminator="processor_type")]]] = None diff --git a/src/data_designer/config/processors.py b/src/data_designer/config/processors.py index 171e10e3..e958c440 100644 --- a/src/data_designer/config/processors.py +++ b/src/data_designer/config/processors.py @@ -4,9 +4,10 @@ import json from abc import ABC from enum import Enum -from typing import Any, Literal +from typing import Any, Literal, Union from pydantic import Field, field_validator +from typing_extensions import TypeAlias from data_designer.config.base import ConfigBase from data_designer.config.dataset_builders import BuildStage @@ -47,6 +48,7 @@ class ProcessorConfig(ConfigBase, ABC): default=BuildStage.POST_BATCH, description=f"The stage at which the processor will run. Supported stages: {', '.join(SUPPORTED_STAGES)}", ) + processor_type: ProcessorType @field_validator("build_stage") def validate_build_stage(cls, v: BuildStage) -> BuildStage: @@ -139,3 +141,9 @@ def validate_template(cls, v: dict[str, Any]) -> dict[str, Any]: if "not JSON serializable" in str(e): raise InvalidConfigError("Template must be JSON serializable") return v + + +ProcessorConfigT: TypeAlias = Union[ + DropColumnsProcessorConfig, + SchemaTransformProcessorConfig, +] \ No newline at end of file From 5f9faa4bfaab8bbff06b8e92b843493707c7d58f Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Wed, 17 Dec 2025 11:41:45 -0300 Subject: [PATCH 2/5] using new type instead of base class everywhere --- src/data_designer/config/config_builder.py | 8 ++++---- src/data_designer/config/utils/validation.py | 8 ++++---- .../engine/dataset_builders/utils/config_compiler.py | 4 ++-- tests/config/test_processors.py | 6 +++--- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/data_designer/config/config_builder.py b/src/data_designer/config/config_builder.py index 5f07595d..db382f88 100644 --- a/src/data_designer/config/config_builder.py +++ b/src/data_designer/config/config_builder.py @@ -29,7 +29,7 @@ from data_designer.config.default_model_settings import get_default_model_configs from data_designer.config.errors import BuilderConfigurationError, InvalidColumnTypeError, InvalidConfigError from data_designer.config.models import ModelConfig, load_model_configs -from data_designer.config.processors import ProcessorConfig, ProcessorType, get_processor_config_from_kwargs +from data_designer.config.processors import ProcessorConfigT, ProcessorType, get_processor_config_from_kwargs from data_designer.config.sampler_constraints import ( ColumnConstraintT, ColumnInequalityConstraint, @@ -141,7 +141,7 @@ def __init__(self, model_configs: Optional[Union[list[ModelConfig], str, Path]] """ self._column_configs = {} self._model_configs = _load_model_configs(model_configs) - self._processor_configs: list[ProcessorConfig] = [] + self._processor_configs: list[ProcessorConfigT] = [] self._seed_config: Optional[SeedConfig] = None self._constraints: list[ColumnConstraintT] = [] self._profilers: list[ColumnProfilerConfigT] = [] @@ -298,7 +298,7 @@ def add_constraint( def add_processor( self, - processor_config: Optional[ProcessorConfig] = None, + processor_config: Optional[ProcessorConfigT] = None, *, processor_type: Optional[ProcessorType] = None, **kwargs, @@ -487,7 +487,7 @@ def get_columns_excluding_type(self, column_type: DataDesignerColumnType) -> lis column_type = resolve_string_enum(column_type, DataDesignerColumnType) return [c for c in self._column_configs.values() if c.column_type != column_type] - def get_processor_configs(self) -> dict[BuildStage, list[ProcessorConfig]]: + def get_processor_configs(self) -> dict[BuildStage, list[ProcessorConfigT]]: """Get processor configuration objects. Returns: diff --git a/src/data_designer/config/utils/validation.py b/src/data_designer/config/utils/validation.py index 7d3654ad..dc1ca2e3 100644 --- a/src/data_designer/config/utils/validation.py +++ b/src/data_designer/config/utils/validation.py @@ -16,7 +16,7 @@ from rich.panel import Panel from data_designer.config.column_types import ColumnConfigT, DataDesignerColumnType, column_type_is_model_generated -from data_designer.config.processors import ProcessorConfig, ProcessorType +from data_designer.config.processors import ProcessorConfigT, ProcessorType from data_designer.config.utils.constants import RICH_CONSOLE_THEME from data_designer.config.utils.misc import ( can_run_data_designer_locally, @@ -57,7 +57,7 @@ def has_column(self) -> bool: def validate_data_designer_config( columns: list[ColumnConfigT], - processor_configs: list[ProcessorConfig], + processor_configs: list[ProcessorConfigT], allowed_references: list[str], ) -> list[Violation]: violations = [] @@ -273,7 +273,7 @@ def validate_columns_not_all_dropped( def validate_drop_columns_processor( columns: list[ColumnConfigT], - processor_configs: list[ProcessorConfig], + processor_configs: list[ProcessorConfigT], ) -> list[Violation]: all_column_names = {c.name for c in columns} for processor_config in processor_configs: @@ -294,7 +294,7 @@ def validate_drop_columns_processor( def validate_schema_transform_processor( columns: list[ColumnConfigT], - processor_configs: list[ProcessorConfig], + processor_configs: list[ProcessorConfigT], ) -> list[Violation]: violations = [] diff --git a/src/data_designer/engine/dataset_builders/utils/config_compiler.py b/src/data_designer/engine/dataset_builders/utils/config_compiler.py index 5359bfa1..91c2cd1d 100644 --- a/src/data_designer/engine/dataset_builders/utils/config_compiler.py +++ b/src/data_designer/engine/dataset_builders/utils/config_compiler.py @@ -3,7 +3,7 @@ from data_designer.config.column_types import DataDesignerColumnType from data_designer.config.data_designer_config import DataDesignerConfig -from data_designer.config.processors import ProcessorConfig +from data_designer.config.processors import ProcessorConfigT from data_designer.engine.dataset_builders.multi_column_configs import ( DatasetBuilderColumnConfigT, SamplerMultiColumnConfig, @@ -56,5 +56,5 @@ def compile_dataset_builder_column_configs(config: DataDesignerConfig) -> list[D def compile_dataset_builder_processor_configs( config: DataDesignerConfig, -) -> list[ProcessorConfig]: +) -> list[ProcessorConfigT]: return config.processors or [] diff --git a/tests/config/test_processors.py b/tests/config/test_processors.py index 4dfa0514..d2bbb711 100644 --- a/tests/config/test_processors.py +++ b/tests/config/test_processors.py @@ -8,7 +8,7 @@ from data_designer.config.errors import InvalidConfigError from data_designer.config.processors import ( DropColumnsProcessorConfig, - ProcessorConfig, + ProcessorConfigT, ProcessorType, SchemaTransformProcessorConfig, get_processor_config_from_kwargs, @@ -23,7 +23,7 @@ def test_drop_columns_processor_config_creation(): assert config.build_stage == BuildStage.POST_BATCH assert config.column_names == ["col1", "col2"] assert config.processor_type == ProcessorType.DROP_COLUMNS - assert isinstance(config, ProcessorConfig) + assert isinstance(config, ProcessorConfigT) def test_drop_columns_processor_config_validation(): @@ -64,7 +64,7 @@ def test_schema_transform_processor_config_creation(): assert config.build_stage == BuildStage.POST_BATCH assert config.template == {"text": "{{ col1 }}"} assert config.processor_type == ProcessorType.SCHEMA_TRANSFORM - assert isinstance(config, ProcessorConfig) + assert isinstance(config, ProcessorConfigT) def test_schema_transform_processor_config_validation(): From f0fddca1db2c440ad3472ad413e6be966f3c3491 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Wed, 17 Dec 2025 12:46:26 -0300 Subject: [PATCH 3/5] lint --- src/data_designer/config/processors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_designer/config/processors.py b/src/data_designer/config/processors.py index e958c440..cdc00d7b 100644 --- a/src/data_designer/config/processors.py +++ b/src/data_designer/config/processors.py @@ -146,4 +146,4 @@ def validate_template(cls, v: dict[str, Any]) -> dict[str, Any]: ProcessorConfigT: TypeAlias = Union[ DropColumnsProcessorConfig, SchemaTransformProcessorConfig, -] \ No newline at end of file +] From 687d3846dc5265114c4bd1c91db194ab57c68b69 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Wed, 17 Dec 2025 13:52:40 -0300 Subject: [PATCH 4/5] using base instead of type in some places --- .../engine/dataset_builders/utils/config_compiler.py | 4 ++-- tests/config/test_processors.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/data_designer/engine/dataset_builders/utils/config_compiler.py b/src/data_designer/engine/dataset_builders/utils/config_compiler.py index 91c2cd1d..5359bfa1 100644 --- a/src/data_designer/engine/dataset_builders/utils/config_compiler.py +++ b/src/data_designer/engine/dataset_builders/utils/config_compiler.py @@ -3,7 +3,7 @@ from data_designer.config.column_types import DataDesignerColumnType from data_designer.config.data_designer_config import DataDesignerConfig -from data_designer.config.processors import ProcessorConfigT +from data_designer.config.processors import ProcessorConfig from data_designer.engine.dataset_builders.multi_column_configs import ( DatasetBuilderColumnConfigT, SamplerMultiColumnConfig, @@ -56,5 +56,5 @@ def compile_dataset_builder_column_configs(config: DataDesignerConfig) -> list[D def compile_dataset_builder_processor_configs( config: DataDesignerConfig, -) -> list[ProcessorConfigT]: +) -> list[ProcessorConfig]: return config.processors or [] diff --git a/tests/config/test_processors.py b/tests/config/test_processors.py index d2bbb711..4dfa0514 100644 --- a/tests/config/test_processors.py +++ b/tests/config/test_processors.py @@ -8,7 +8,7 @@ from data_designer.config.errors import InvalidConfigError from data_designer.config.processors import ( DropColumnsProcessorConfig, - ProcessorConfigT, + ProcessorConfig, ProcessorType, SchemaTransformProcessorConfig, get_processor_config_from_kwargs, @@ -23,7 +23,7 @@ def test_drop_columns_processor_config_creation(): assert config.build_stage == BuildStage.POST_BATCH assert config.column_names == ["col1", "col2"] assert config.processor_type == ProcessorType.DROP_COLUMNS - assert isinstance(config, ProcessorConfigT) + assert isinstance(config, ProcessorConfig) def test_drop_columns_processor_config_validation(): @@ -64,7 +64,7 @@ def test_schema_transform_processor_config_creation(): assert config.build_stage == BuildStage.POST_BATCH assert config.template == {"text": "{{ col1 }}"} assert config.processor_type == ProcessorType.SCHEMA_TRANSFORM - assert isinstance(config, ProcessorConfigT) + assert isinstance(config, ProcessorConfig) def test_schema_transform_processor_config_validation(): From a10f6aae5098508c4fa6cf825bc467ac75aeee03 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Wed, 17 Dec 2025 14:04:32 -0300 Subject: [PATCH 5/5] processor_type needs to be str for correct ser/de --- src/data_designer/config/processors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_designer/config/processors.py b/src/data_designer/config/processors.py index cdc00d7b..17d2ff7b 100644 --- a/src/data_designer/config/processors.py +++ b/src/data_designer/config/processors.py @@ -48,7 +48,7 @@ class ProcessorConfig(ConfigBase, ABC): default=BuildStage.POST_BATCH, description=f"The stage at which the processor will run. Supported stages: {', '.join(SUPPORTED_STAGES)}", ) - processor_type: ProcessorType + processor_type: str @field_validator("build_stage") def validate_build_stage(cls, v: BuildStage) -> BuildStage: