make sampler type a discriminated union; add injection validator (#71)

johnnygreco · web-flow · commit 202eba623065 · 2025-11-24T16:45:58.000-05:00
diff --git a/src/data_designer/config/column_configs.py b/src/data_designer/config/column_configs.py
@@ -2,9 +2,9 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from abc import ABC
-from typing import Literal, Optional, Type, Union
+from typing import Annotated, Literal, Optional, Type, Union
 
-from pydantic import BaseModel, Field, model_validator
+from pydantic import BaseModel, Discriminator, Field, model_validator
 from typing_extensions import Self
 
 from .base import ConfigBase
@@ -89,11 +89,36 @@ class SamplerColumnConfig(SingleColumnConfig):
     """
 
     sampler_type: SamplerType
-    params: SamplerParamsT
-    conditional_params: dict[str, SamplerParamsT] = {}
+    params: Annotated[SamplerParamsT, Discriminator("sampler_type")]
+    conditional_params: dict[str, Annotated[SamplerParamsT, Discriminator("sampler_type")]] = {}
     convert_to: Optional[str] = None
     column_type: Literal["sampler"] = "sampler"
 
+    @model_validator(mode="before")
+    @classmethod
+    def inject_sampler_type_into_params(cls, data: dict) -> dict:
+        """Return a copy of ``data`` with ``sampler_type`` injected into untagged params dicts.
+
+        Covers ``params`` and every ``conditional_params`` entry; the caller's input is never mutated.
+        """
+        if not isinstance(data, dict) or not data.get("sampler_type"):
+            return data
+        sampler_type = data["sampler_type"]
+
+        def _tag(value):
+            # Only plain dicts without a tag are rewritten; anything else passes through as-is.
+            if isinstance(value, dict) and "sampler_type" not in value:
+                return {"sampler_type": sampler_type, **value}
+            return value
+
+        patched = dict(data)
+        if "params" in patched:
+            patched["params"] = _tag(patched["params"])
+        conditional_params = patched.get("conditional_params")
+        if isinstance(conditional_params, dict):
+            patched["conditional_params"] = {key: _tag(value) for key, value in conditional_params.items()}
+        return patched
+
 
 class LLMTextColumnConfig(SingleColumnConfig):
     """Configuration for text generation columns using Large Language Models.
diff --git a/src/data_designer/config/sampler_params.py b/src/data_designer/config/sampler_params.py
@@ -66,6 +66,7 @@ class CategorySamplerParams(ConfigBase):
             "Larger values will be sampled with higher probability."
         ),
     )
+    sampler_type: Literal[SamplerType.CATEGORY] = SamplerType.CATEGORY
 
     @model_validator(mode="after")
     def _normalize_weights_if_needed(self) -> Self:
@@ -106,6 +107,7 @@ class DatetimeSamplerParams(ConfigBase):
         default="D",
         description="Sampling units, e.g. the smallest possible time interval between samples.",
     )
+    sampler_type: Literal[SamplerType.DATETIME] = SamplerType.DATETIME
 
     @field_validator("start", "end")
     @classmethod
@@ -136,6 +138,7 @@ class SubcategorySamplerParams(ConfigBase):
         ...,
         description="Mapping from each value of parent category to a list of subcategory values.",
     )
+    sampler_type: Literal[SamplerType.SUBCATEGORY] = SamplerType.SUBCATEGORY
 
 
 class TimeDeltaSamplerParams(ConfigBase):
@@ -187,6 +190,7 @@ class TimeDeltaSamplerParams(ConfigBase):
         default="D",
         description="Sampling units, e.g. the smallest possible time interval between samples.",
     )
+    sampler_type: Literal[SamplerType.TIMEDELTA] = SamplerType.TIMEDELTA
 
     @model_validator(mode="after")
     def _validate_min_less_than_max(self) -> Self:
@@ -219,6 +223,7 @@ class UUIDSamplerParams(ConfigBase):
         default=False,
         description="If true, all letters in the UUID will be capitalized.",
     )
+    sampler_type: Literal[SamplerType.UUID] = SamplerType.UUID
 
     @property
     def last_index(self) -> int:
@@ -257,6 +262,7 @@ class ScipySamplerParams(ConfigBase):
     decimal_places: Optional[int] = Field(
         default=None, description="Number of decimal places to round the sampled values to."
     )
+    sampler_type: Literal[SamplerType.SCIPY] = SamplerType.SCIPY
 
 
 class BinomialSamplerParams(ConfigBase):
@@ -273,6 +279,7 @@ class BinomialSamplerParams(ConfigBase):
 
     n: int = Field(..., description="Number of trials.")
     p: float = Field(..., description="Probability of success on each trial.", ge=0.0, le=1.0)
+    sampler_type: Literal[SamplerType.BINOMIAL] = SamplerType.BINOMIAL
 
 
 class BernoulliSamplerParams(ConfigBase):
@@ -288,6 +295,7 @@ class BernoulliSamplerParams(ConfigBase):
     """
 
     p: float = Field(..., description="Probability of success.", ge=0.0, le=1.0)
+    sampler_type: Literal[SamplerType.BERNOULLI] = SamplerType.BERNOULLI
 
 
 class BernoulliMixtureSamplerParams(ConfigBase):
@@ -327,6 +335,7 @@ class BernoulliMixtureSamplerParams(ConfigBase):
         ...,
         description="Parameters of the scipy.stats distribution given in `dist_name`.",
     )
+    sampler_type: Literal[SamplerType.BERNOULLI_MIXTURE] = SamplerType.BERNOULLI_MIXTURE
 
 
 class GaussianSamplerParams(ConfigBase):
@@ -350,6 +359,7 @@ class GaussianSamplerParams(ConfigBase):
     decimal_places: Optional[int] = Field(
         default=None, description="Number of decimal places to round the sampled values to."
     )
+    sampler_type: Literal[SamplerType.GAUSSIAN] = SamplerType.GAUSSIAN
 
 
 class PoissonSamplerParams(ConfigBase):
@@ -369,6 +379,7 @@ class PoissonSamplerParams(ConfigBase):
     """
 
     mean: float = Field(..., description="Mean number of events in a fixed interval.")
+    sampler_type: Literal[SamplerType.POISSON] = SamplerType.POISSON
 
 
 class UniformSamplerParams(ConfigBase):
@@ -390,6 +401,7 @@ class UniformSamplerParams(ConfigBase):
     decimal_places: Optional[int] = Field(
         default=None, description="Number of decimal places to round the sampled values to."
     )
+    sampler_type: Literal[SamplerType.UNIFORM] = SamplerType.UNIFORM
 
 
 #########################################
@@ -470,11 +482,12 @@ class PersonSamplerParams(ConfigBase):
         default=False,
         description="If True, then append synthetic persona columns to each generated person.",
     )
+    sampler_type: Literal[SamplerType.PERSON] = SamplerType.PERSON
 
     @property
     def generator_kwargs(self) -> list[str]:
         """Keyword arguments to pass to the person generator."""
-        return [f for f in list(PersonSamplerParams.model_fields) if f != "locale"]
+        return [f for f in list(PersonSamplerParams.model_fields) if f not in ("locale", "sampler_type")]
 
     @property
     def people_gen_key(self) -> str:
@@ -533,11 +546,12 @@ class PersonFromFakerSamplerParams(ConfigBase):
         min_length=2,
         max_length=2,
     )
+    sampler_type: Literal[SamplerType.PERSON_FROM_FAKER] = SamplerType.PERSON_FROM_FAKER
 
     @property
     def generator_kwargs(self) -> list[str]:
         """Keyword arguments to pass to the person generator."""
-        return [f for f in list(PersonFromFakerSamplerParams.model_fields) if f != "locale"]
+        return [f for f in list(PersonFromFakerSamplerParams.model_fields) if f not in ("locale", "sampler_type")]
 
     @property
     def people_gen_key(self) -> str:
diff --git a/tests/config/test_columns.py b/tests/config/test_columns.py
@@ -23,7 +23,15 @@
     get_column_display_order,
 )
 from data_designer.config.errors import InvalidConfigError
-from data_designer.config.sampler_params import SamplerType, UUIDSamplerParams
+from data_designer.config.sampler_params import (
+    CategorySamplerParams,
+    GaussianSamplerParams,
+    PersonFromFakerSamplerParams,
+    PersonSamplerParams,
+    SamplerType,
+    UniformSamplerParams,
+    UUIDSamplerParams,
+)
 from data_designer.config.utils.code_lang import CodeLang
 from data_designer.config.utils.errors import UserJinjaTemplateSyntaxError
 from data_designer.config.validator_params import CodeValidatorParams
@@ -324,3 +332,114 @@ def test_get_column_config_from_kwargs():
         ),
         SeedDatasetColumnConfig,
     )
+
+
+def test_sampler_column_config_discriminated_union_with_dict_params():
+    """Dict params lacking sampler_type resolve to UniformSamplerParams tagged with the outer sampler_type."""
+    config = SamplerColumnConfig(
+        name="test_uniform",
+        sampler_type=SamplerType.UNIFORM,
+        params={"low": 0.0, "high": 1.0, "decimal_places": 2},
+    )
+    assert config.name == "test_uniform"
+    assert config.sampler_type == SamplerType.UNIFORM
+    assert isinstance(config.params, UniformSamplerParams)
+    assert config.params.sampler_type == SamplerType.UNIFORM
+    assert config.params.low == 0.0
+    assert config.params.high == 1.0
+    assert config.params.decimal_places == 2
+
+
+def test_sampler_column_config_discriminated_union_with_explicit_sampler_type():
+    """Params that already carry a matching sampler_type key are accepted and parsed as CategorySamplerParams."""
+    config = SamplerColumnConfig(
+        name="test_category",
+        sampler_type=SamplerType.CATEGORY,
+        params={"sampler_type": "category", "values": ["A", "B", "C"], "weights": [0.5, 0.3, 0.2]},
+    )
+    assert config.name == "test_category"
+    assert config.sampler_type == SamplerType.CATEGORY
+    assert isinstance(config.params, CategorySamplerParams)
+    assert config.params.sampler_type == SamplerType.CATEGORY
+    assert config.params.values == ["A", "B", "C"]
+
+
+def test_sampler_column_config_discriminated_union_serialization():
+    """model_dump output carries params.sampler_type and can be fed back to rebuild PersonSamplerParams."""
+    config = SamplerColumnConfig(
+        name="test_person",
+        sampler_type=SamplerType.PERSON,
+        params={"locale": "en_US", "sex": "Female", "age_range": [25, 45]},
+    )
+
+    # Serialize: the dumped params must include the discriminator tag
+    serialized = config.model_dump()
+    assert "sampler_type" in serialized["params"]
+    assert serialized["params"]["sampler_type"] == "person"
+
+    # Deserialize: rebuilding from the dump must resolve params to the same class and values
+    deserialized = SamplerColumnConfig(**serialized)
+    assert isinstance(deserialized.params, PersonSamplerParams)
+    assert deserialized.params.locale == "en_US"
+    assert deserialized.params.sex == "Female"
+    assert deserialized.params.age_range == [25, 45]
+
+
+def test_sampler_column_config_discriminated_union_person_vs_person_from_faker():
+    """Identical-looking params resolve to different classes depending on the outer sampler_type."""
+    # Test person sampler (managed datasets)
+    person_config = SamplerColumnConfig(
+        name="test_person",
+        sampler_type=SamplerType.PERSON,
+        params={"locale": "en_US", "sex": "Male", "age_range": [30, 50]},
+    )
+    assert isinstance(person_config.params, PersonSamplerParams)
+    assert person_config.params.sampler_type == SamplerType.PERSON
+    assert person_config.params.locale == "en_US"
+
+    # Test person_from_faker sampler (Faker-based)
+    person_faker_config = SamplerColumnConfig(
+        name="test_person_faker",
+        sampler_type=SamplerType.PERSON_FROM_FAKER,
+        params={"locale": "en_GB", "sex": "Female", "age_range": [20, 40]},
+    )
+    assert isinstance(person_faker_config.params, PersonFromFakerSamplerParams)
+    assert person_faker_config.params.sampler_type == SamplerType.PERSON_FROM_FAKER
+    assert person_faker_config.params.locale == "en_GB"
+
+    # Verify they are different types (identity comparison, and neither matches the other's class)
+    assert type(person_config.params) is not type(person_faker_config.params)
+    assert not isinstance(person_config.params, PersonFromFakerSamplerParams)
+    assert not isinstance(person_faker_config.params, PersonSamplerParams)
+
+
+def test_sampler_column_config_discriminated_union_with_conditional_params():
+    """Each conditional_params dict is tagged with the outer sampler_type and parsed as GaussianSamplerParams."""
+    config = SamplerColumnConfig(
+        name="test_gaussian",
+        sampler_type=SamplerType.GAUSSIAN,
+        params={"mean": 0.0, "stddev": 1.0},
+        conditional_params={"age > 21": {"mean": 5.0, "stddev": 2.0}},
+    )
+
+    assert isinstance(config.params, GaussianSamplerParams)
+    assert config.params.mean == 0.0
+    assert config.params.stddev == 1.0
+
+    # The conditional entry must resolve to the same params class, keeping its own values
+    assert "age > 21" in config.conditional_params
+    cond_param = config.conditional_params["age > 21"]
+    assert isinstance(cond_param, GaussianSamplerParams)
+    assert cond_param.sampler_type == SamplerType.GAUSSIAN
+    assert cond_param.mean == 5.0
+    assert cond_param.stddev == 2.0
+
+
+def test_sampler_column_config_discriminated_union_wrong_params_type():
+    """Category-style params passed with sampler_type=UNIFORM must raise ValidationError."""
+    with pytest.raises(ValidationError):
+        SamplerColumnConfig(
+            name="test_wrong_params",
+            sampler_type=SamplerType.UNIFORM,
+            params={"values": ["A", "B"]},  # Category params for uniform sampler
+        )
diff --git a/tests/engine/analysis/column_profilers/test_base.py b/tests/engine/analysis/column_profilers/test_base.py
@@ -16,7 +16,9 @@
 
 def test_column_config_with_dataframe_valid_column_config_with_dataframe():
     df = pd.DataFrame({"test_column": [1, 2, 3]})
-    column_config = SamplerColumnConfig(name="test_column", sampler_type=SamplerType.CATEGORY, params={})
+    column_config = SamplerColumnConfig(
+        name="test_column", sampler_type=SamplerType.CATEGORY, params={"values": [1, 2, 3]}
+    )
 
     config_with_df = ColumnConfigWithDataFrame(column_config=column_config, df=df)
 
@@ -27,15 +29,19 @@ def test_column_config_with_dataframe_valid_column_config_with_dataframe():
 
 def test_column_config_with_dataframe_column_not_found_validation_error():
     df = pd.DataFrame({"other_column": [1, 2, 3]})
-    column_config = SamplerColumnConfig(name="test_column", sampler_type=SamplerType.CATEGORY, params={})
+    column_config = SamplerColumnConfig(
+        name="test_column", sampler_type=SamplerType.CATEGORY, params={"values": [1, 2, 3]}
+    )
 
     with pytest.raises(ValidationError, match="Column 'test_column' not found in DataFrame"):
         ColumnConfigWithDataFrame(column_config=column_config, df=df)
 
 
 def test_column_config_with_dataframe_pyarrow_backend_conversion():
     df = pd.DataFrame({"test_column": [1, 2, 3]})
-    column_config = SamplerColumnConfig(name="test_column", sampler_type=SamplerType.CATEGORY, params={})
+    column_config = SamplerColumnConfig(
+        name="test_column", sampler_type=SamplerType.CATEGORY, params={"values": [1, 2, 3]}
+    )
 
     config_with_df = ColumnConfigWithDataFrame(column_config=column_config, df=df)
 
@@ -44,7 +50,9 @@ def test_column_config_with_dataframe_pyarrow_backend_conversion():
 
 def test_column_config_with_dataframe_as_tuple_method():
     df = pd.DataFrame({"test_column": [1, 2, 3]})
-    column_config = SamplerColumnConfig(name="test_column", sampler_type=SamplerType.CATEGORY, params={})
+    column_config = SamplerColumnConfig(
+        name="test_column", sampler_type=SamplerType.CATEGORY, params={"values": [1, 2, 3]}
+    )
 
     config_with_df = ColumnConfigWithDataFrame(column_config=column_config, df=df)
     column_config_result, df_result = config_with_df.as_tuple()