diff --git a/examples/.gitignore b/examples/.gitignore
new file mode 100644
index 00000000..8024e08c
--- /dev/null
+++ b/examples/.gitignore
@@ -0,0 +1 @@
+**/artifacts
diff --git a/examples/custom_column.py b/examples/custom_column.py
new file mode 100644
index 00000000..528bc29d
--- /dev/null
+++ b/examples/custom_column.py
@@ -0,0 +1,75 @@
+import pandas as pd
+
+from data_designer.essentials import (
+    CustomColumnConfig,
+    DataDesigner,
+    DataDesignerConfigBuilder,
+    InferenceParameters,
+    LoggingConfig,
+    ModelConfig,
+    configure_logging,
+)
+
+configure_logging(LoggingConfig.debug())
+
+# Initialize NDD and add columns
+MODEL_ALIAS = "nano"
+SYSTEM_PROMPT = "/no_think"
+
+model_configs = [
+    ModelConfig(
+        alias=MODEL_ALIAS,
+        model="nvidia/nvidia-nemotron-nano-9b-v2",
+        inference_parameters=InferenceParameters(
+            temperature=0.5,
+            top_p=1.0,
+            max_tokens=1024,
+            max_parallel_requests=4,
+        ),
+        provider="nvidia",
+    )
+]
+
+builder = DataDesignerConfigBuilder(model_configs=model_configs)
+
+builder.add_column(
+    name="topic",
+    column_type="sampler",
+    sampler_type="category",
+    params={
+        "values": [
+            "healthcare",
+            "finance",
+            "technology",
+        ]
+    },
+)
+
+builder.add_column(
+    name="text",
+    column_type="llm-text",
+    model_alias=MODEL_ALIAS,
+    prompt="Write me a paragraph about {{ topic }}.",
+    system_prompt=SYSTEM_PROMPT,
+)
+
+
+def generator_function(df: pd.DataFrame) -> pd.DataFrame:
+    """Add ``length_frac``: the length of each generated text divided by 1000."""
+    df["length_frac"] = df["text"].apply(lambda x: len(x) / 1000)
+    return df
+
+
+builder.add_column(
+    CustomColumnConfig(
+        name="length_frac",
+        generator_function=generator_function,
+    )
+)
+
+# Generate dataset
+dd = DataDesigner(artifact_path="./artifacts")
+dd_preview = dd.preview(builder, num_records=10)
+dd_preview.display_sample_record()
+
+dd.create(builder, num_records=20)
diff --git a/src/data_designer/config/columns.py b/src/data_designer/config/columns.py
index c2449499..c9a43754 100644
--- a/src/data_designer/config/columns.py
+++ b/src/data_designer/config/columns.py
@@ -3,9 +3,10 @@
 
 from abc import ABC, abstractmethod
 from enum import Enum
-from typing import Literal, Optional, Type, Union
+from typing import Callable, Literal, Optional, Type, Union
 
-from pydantic import BaseModel, Field, model_validator
+import pandas as pd
+from pydantic import BaseModel, Field, field_serializer, model_validator
 from typing_extensions import Self, TypeAlias
 
 from .base import ConfigBase
@@ -28,6 +29,7 @@ class DataDesignerColumnType(str, Enum):
     EXPRESSION = "expression"
     VALIDATION = "validation"
     SEED_DATASET = "seed-dataset"
+    CUSTOM = "custom"
 
     @staticmethod
     def get_display_order() -> list[Self]:
@@ -40,6 +42,7 @@ def get_display_order() -> list[Self]:
             DataDesignerColumnType.LLM_JUDGE,
             DataDesignerColumnType.VALIDATION,
             DataDesignerColumnType.EXPRESSION,
+            DataDesignerColumnType.CUSTOM,
         ]
 
     @property
@@ -55,6 +58,7 @@ def is_dag_column_type(self) -> bool:
             self.LLM_STRUCTURED,
             self.LLM_TEXT,
             self.VALIDATION,
+            self.CUSTOM,
         ]
 
 
@@ -195,6 +199,27 @@ def column_type(self) -> DataDesignerColumnType:
         return DataDesignerColumnType.SEED_DATASET
 
 
+class CustomColumnConfig(SingleColumnConfig):
+    """Configuration for a column produced by a user-supplied Python function.
+
+    The function receives a DataFrame and must return a DataFrame that
+    additionally contains the column named ``name``.
+    """
+
+    generator_function: Callable[[pd.DataFrame], pd.DataFrame]
+
+    @property
+    def column_type(self) -> DataDesignerColumnType:
+        return DataDesignerColumnType.CUSTOM
+
+    @field_serializer("generator_function")
+    def serialize_generator_function(self, v: Callable[[pd.DataFrame], pd.DataFrame]) -> str:
+        """Serialize the callable as its name; the function body itself is not serialized."""
+        # functools.partial objects and callable instances have no ``__name__``,
+        # so fall back to ``repr`` instead of raising AttributeError.
+        return getattr(v, "__name__", repr(v))
+
+
 COLUMN_TYPE_EMOJI_MAP = {
     "general": "⚛️",  # possible analysis column type
     DataDesignerColumnType.EXPRESSION: "🧩",
@@ -205,6 +230,7 @@ def column_type(self) -> DataDesignerColumnType:
     DataDesignerColumnType.SEED_DATASET: "🌱",
     DataDesignerColumnType.SAMPLER: "🎲",
     DataDesignerColumnType.VALIDATION: "🔍",
+    DataDesignerColumnType.CUSTOM: "🛠️",
 }
 
 
@@ -217,6 +243,7 @@ def column_type(self) -> DataDesignerColumnType:
     SamplerColumnConfig,
     SeedDatasetColumnConfig,
     ValidationColumnConfig,
+    CustomColumnConfig,
 ]
 
 
@@ -248,6 +275,8 @@ def get_column_config_from_kwargs(name: str, column_type: DataDesignerColumnType
         return SamplerColumnConfig(name=name, **_resolve_sampler_kwargs(name, kwargs))
     elif column_type == DataDesignerColumnType.SEED_DATASET:
         return SeedDatasetColumnConfig(name=name, **kwargs)
+    elif column_type == DataDesignerColumnType.CUSTOM:
+        return CustomColumnConfig(name=name, **kwargs)
     raise InvalidColumnTypeError(f"🛑 {column_type} is not a valid column type.")  # pragma: no cover
 
 
diff --git a/src/data_designer/config/utils/visualization.py b/src/data_designer/config/utils/visualization.py
index f245517c..c80fb050 100644
--- a/src/data_designer/config/utils/visualization.py
+++ b/src/data_designer/config/utils/visualization.py
@@ -160,6 +160,7 @@ def display_sample_record(
         + config_builder.get_columns_of_type(DataDesignerColumnType.EXPRESSION)
         + config_builder.get_columns_of_type(DataDesignerColumnType.LLM_TEXT)
         + config_builder.get_columns_of_type(DataDesignerColumnType.LLM_STRUCTURED)
+        + config_builder.get_columns_of_type(DataDesignerColumnType.CUSTOM)
     )
     if len(non_code_columns) > 0:
         table = Table(title="Generated Columns", **table_kws)
diff --git a/src/data_designer/engine/column_generators/generators/custom.py b/src/data_designer/engine/column_generators/generators/custom.py
new file mode 100644
index 00000000..7929ccc4
--- /dev/null
+++ b/src/data_designer/engine/column_generators/generators/custom.py
@@ -0,0 +1,85 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+
+import pandas as pd
+
+from data_designer.config.columns import CustomColumnConfig
+from data_designer.engine.column_generators.generators.base import (
+    ColumnGenerator,
+    GenerationStrategy,
+    GeneratorMetadata,
+)
+from data_designer.engine.errors import DataDesignerRuntimeError
+
+logger = logging.getLogger(__name__)
+
+
+class CustomColumnGenerator(ColumnGenerator[CustomColumnConfig]):
+    """Generate a column by calling the user-supplied ``generator_function``."""
+
+    @staticmethod
+    def metadata() -> GeneratorMetadata:
+        return GeneratorMetadata(
+            name="custom",
+            description="Generate a custom column.",
+            generation_strategy=GenerationStrategy.FULL_COLUMN,
+            required_resources=None,
+        )
+
+    def generate(self, data: pd.DataFrame) -> pd.DataFrame:
+        """Run the configured generator function and validate its output.
+
+        Args:
+            data: Input records; passed to the generator function as-is.
+
+        Returns:
+            The DataFrame returned by the generator function.
+
+        Raises:
+            DataDesignerRuntimeError: If the generator function raises, does not
+                return a DataFrame, adds no column, or adds any column other
+                than ``self.config.name``.
+        """
+        generator_function = self.config.generator_function
+        # functools.partial objects and callable instances have no ``__name__``.
+        function_name = getattr(generator_function, "__name__", repr(generator_function))
+
+        logger.info(f"🛠️ Generating custom column {self.config.name!r} with {len(data)} records")
+        logger.info(f" |-- generator function: {function_name}")
+
+        # Snapshot the columns before the call: the function may mutate ``data`` in place.
+        original_columns = set(data.columns)
+
+        # Keep the ``try`` body minimal so that only failures of the user's function
+        # are wrapped; the validation errors below are raised without re-wrapping.
+        try:
+            result = generator_function(data)
+        except Exception as e:
+            raise DataDesignerRuntimeError(f"Error generating custom column {self.config.name!r}: {e}") from e
+
+        if not isinstance(result, pd.DataFrame):
+            raise DataDesignerRuntimeError(
+                f"Custom column generator {function_name} returned {type(result).__name__}. "
+                "Expected a pandas DataFrame."
+            )
+
+        # The generator must add exactly one new column: the configured one.
+        diff_columns = set(result.columns) - original_columns
+        if not diff_columns:
+            raise DataDesignerRuntimeError(
+                f"Custom column generator {function_name} added no columns. "
+                f"Expected column {self.config.name!r} to be added by this generator."
+            )
+
+        unexpected_columns = diff_columns - {self.config.name}
+        if unexpected_columns:
+            # Sort so the error message is deterministic (set order is not).
+            diff_columns_str = ", ".join(sorted(map(str, unexpected_columns)))
+            raise DataDesignerRuntimeError(
+                f"Custom column generator {function_name} added unexpected columns: {diff_columns_str}. "
+                f"Expected only column {self.config.name!r} to be added by this generator."
+            )
+
+        return result
diff --git a/src/data_designer/engine/column_generators/registry.py b/src/data_designer/engine/column_generators/registry.py
index 95c6ee7a..7a45632f 100644
--- a/src/data_designer/engine/column_generators/registry.py
+++ b/src/data_designer/engine/column_generators/registry.py
@@ -3,6 +3,7 @@
 
 from data_designer.config.base import ConfigBase
 from data_designer.config.columns import (
+    CustomColumnConfig,
     DataDesignerColumnType,
     ExpressionColumnConfig,
     LLMCodeColumnConfig,
@@ -12,6 +13,7 @@
     ValidationColumnConfig,
 )
 from data_designer.engine.column_generators.generators.base import ColumnGenerator
+from data_designer.engine.column_generators.generators.custom import CustomColumnGenerator
 from data_designer.engine.column_generators.generators.expression import ExpressionColumnGenerator
 from data_designer.engine.column_generators.generators.llm_generators import (
     LLMCodeCellGenerator,
@@ -39,6 +41,7 @@ def create_default_column_generator_registry() -> ColumnGeneratorRegistry:
     registry.register(DataDesignerColumnType.LLM_JUDGE, LLMJudgeCellGenerator, LLMJudgeColumnConfig, False)
     registry.register(DataDesignerColumnType.EXPRESSION, ExpressionColumnGenerator, ExpressionColumnConfig, False)
     registry.register(DataDesignerColumnType.SAMPLER, SamplerColumnGenerator, SamplerMultiColumnConfig, False)
+    registry.register(DataDesignerColumnType.CUSTOM, CustomColumnGenerator, CustomColumnConfig, False)
     registry.register(
         DataDesignerColumnType.SEED_DATASET,
         SeedDatasetColumnGenerator,
diff --git a/src/data_designer/essentials/__init__.py b/src/data_designer/essentials/__init__.py
index 3b3fd96f..3d241ba0 100644
--- a/src/data_designer/essentials/__init__.py
+++ b/src/data_designer/essentials/__init__.py
@@ -3,6 +3,7 @@
 
 from ..config.analysis.column_profilers import JudgeScoreProfilerConfig
 from ..config.columns import (
+    CustomColumnConfig,
     DataDesignerColumnType,
     ExpressionColumnConfig,
     LLMCodeColumnConfig,
@@ -79,6 +80,7 @@
     "CodeValidatorParams",
     "ColumnInequalityConstraint",
     "configure_logging",
+    "CustomColumnConfig",
     "DataDesignerColumnType",
     "DataDesignerConfig",
     "DataDesignerConfigBuilder",
diff --git a/tests/config/test_columns.py b/tests/config/test_columns.py
index 0bfa725d..2b3bcec3 100644
--- a/tests/config/test_columns.py
+++ b/tests/config/test_columns.py
@@ -38,6 +38,7 @@ def test_data_designer_column_type_get_display_order():
         DataDesignerColumnType.LLM_JUDGE,
         DataDesignerColumnType.VALIDATION,
         DataDesignerColumnType.EXPRESSION,
+        DataDesignerColumnType.CUSTOM,
     ]
 
 
diff --git a/tests/engine/column_generators/generators/test_custom.py b/tests/engine/column_generators/generators/test_custom.py
new file mode 100644
index 00000000..dca2c43f
--- /dev/null
+++ b/tests/engine/column_generators/generators/test_custom.py
@@ -0,0 +1,159 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from functools import partial
+
+import pandas as pd
+import pytest
+
+from data_designer.config.columns import CustomColumnConfig
+from data_designer.engine.column_generators.generators.custom import CustomColumnGenerator
+from data_designer.engine.errors import DataDesignerRuntimeError
+
+
+def test_generate_successful_custom_column(stub_resource_provider: object) -> None:
+    """Test successful generation of a custom column."""
+
+    def add_sum_column(data: pd.DataFrame) -> pd.DataFrame:
+        data["sum_column"] = data["col1"] + data["other_col"]
+        return data
+
+    config = CustomColumnConfig(name="sum_column", generator_function=add_sum_column)
+    generator = CustomColumnGenerator(config=config, resource_provider=stub_resource_provider)
+
+    df = pd.DataFrame({"col1": [1, 2, 3, 4], "other_col": [10, 20, 30, 40]})
+    result = generator.generate(df)
+
+    assert "sum_column" in result.columns
+    assert result["sum_column"].tolist() == [11, 22, 33, 44]
+    assert len(result) == 4
+
+
+def test_generate_custom_column_with_string_data(stub_resource_provider: object) -> None:
+    """Test custom column generation with string manipulation."""
+
+    def add_full_name_column(data: pd.DataFrame) -> pd.DataFrame:
+        data["full_name"] = data["first_name"] + " " + data["last_name"]
+        return data
+
+    config = CustomColumnConfig(name="full_name", generator_function=add_full_name_column)
+    generator = CustomColumnGenerator(config=config, resource_provider=stub_resource_provider)
+
+    df = pd.DataFrame({"first_name": ["John", "Jane", "Bob"], "last_name": ["Doe", "Smith", "Johnson"]})
+    result = generator.generate(df)
+
+    assert "full_name" in result.columns
+    assert result["full_name"].tolist() == ["John Doe", "Jane Smith", "Bob Johnson"]
+
+
+def test_generate_error_when_unexpected_columns_added(stub_resource_provider: object) -> None:
+    """Test that an error is raised when the generator adds unexpected columns."""
+
+    def add_multiple_columns(data: pd.DataFrame) -> pd.DataFrame:
+        data["expected_column"] = data["col1"] * 2
+        data["unexpected_column"] = data["col1"] * 3  # This should cause an error
+        return data
+
+    config = CustomColumnConfig(name="expected_column", generator_function=add_multiple_columns)
+    generator = CustomColumnGenerator(config=config, resource_provider=stub_resource_provider)
+
+    df = pd.DataFrame({"col1": [1, 2, 3]})
+
+    with pytest.raises(
+        DataDesignerRuntimeError,
+        match=r"Custom column generator add_multiple_columns added unexpected columns: unexpected_column",
+    ):
+        generator.generate(df)
+
+
+def test_generate_error_when_no_column_added(stub_resource_provider: object) -> None:
+    """Test that an error is raised when the generator doesn't add the expected column."""
+
+    def add_no_columns(data: pd.DataFrame) -> pd.DataFrame:
+        return data
+
+    config = CustomColumnConfig(name="missing_column", generator_function=add_no_columns)
+    generator = CustomColumnGenerator(config=config, resource_provider=stub_resource_provider)
+
+    df = pd.DataFrame({"col1": [1, 2, 3]})
+
+    with pytest.raises(
+        DataDesignerRuntimeError,
+        match=r"Custom column generator add_no_columns added no columns",
+    ):
+        generator.generate(df)
+
+
+def test_generate_error_when_generator_function_raises_exception(stub_resource_provider: object) -> None:
+    """Test that exceptions from the generator function are properly wrapped."""
+
+    def failing_generator(data: pd.DataFrame) -> pd.DataFrame:
+        raise ValueError("Something went wrong in the generator")
+
+    config = CustomColumnConfig(name="test_column", generator_function=failing_generator)
+    generator = CustomColumnGenerator(config=config, resource_provider=stub_resource_provider)
+
+    df = pd.DataFrame({"col1": [1, 2, 3]})
+
+    with pytest.raises(
+        DataDesignerRuntimeError, match=r"Error generating custom column 'test_column': Something went wrong"
+    ):
+        generator.generate(df)
+
+
+def test_generate_validation_error_is_not_wrapped_twice(stub_resource_provider: object) -> None:
+    """Test that validation errors are raised as-is instead of being wrapped a second time."""
+
+    def add_no_columns(data: pd.DataFrame) -> pd.DataFrame:
+        return data
+
+    config = CustomColumnConfig(name="missing_column", generator_function=add_no_columns)
+    generator = CustomColumnGenerator(config=config, resource_provider=stub_resource_provider)
+
+    with pytest.raises(DataDesignerRuntimeError) as exc_info:
+        generator.generate(pd.DataFrame({"col1": [1, 2, 3]}))
+
+    assert str(exc_info.value).startswith("Custom column generator add_no_columns added no columns")
+
+
+def test_generate_error_chains_original_exception(stub_resource_provider: object) -> None:
+    """Test that the exception raised by the generator function is kept as the cause."""
+
+    def failing_generator(data: pd.DataFrame) -> pd.DataFrame:
+        raise ValueError("Something went wrong in the generator")
+
+    config = CustomColumnConfig(name="test_column", generator_function=failing_generator)
+    generator = CustomColumnGenerator(config=config, resource_provider=stub_resource_provider)
+
+    with pytest.raises(DataDesignerRuntimeError) as exc_info:
+        generator.generate(pd.DataFrame({"col1": [1, 2, 3]}))
+
+    assert isinstance(exc_info.value.__cause__, ValueError)
+
+
+def test_generate_error_when_generator_returns_non_dataframe(stub_resource_provider: object) -> None:
+    """Test that a generator function that forgets to return the DataFrame is reported clearly."""
+
+    def forgets_to_return(data: pd.DataFrame) -> None:
+        data["new_column"] = data["col1"] * 2
+
+    config = CustomColumnConfig(name="new_column", generator_function=forgets_to_return)
+    generator = CustomColumnGenerator(config=config, resource_provider=stub_resource_provider)
+
+    with pytest.raises(DataDesignerRuntimeError, match=r"returned NoneType"):
+        generator.generate(pd.DataFrame({"col1": [1, 2, 3]}))
+
+
+def test_generate_with_partial_generator_function(stub_resource_provider: object) -> None:
+    """Test that callables without ``__name__``, such as ``functools.partial``, are supported."""
+
+    def add_scaled_column(data: pd.DataFrame, factor: int) -> pd.DataFrame:
+        data["scaled"] = data["col1"] * factor
+        return data
+
+    config = CustomColumnConfig(name="scaled", generator_function=partial(add_scaled_column, factor=10))
+    generator = CustomColumnGenerator(config=config, resource_provider=stub_resource_provider)
+
+    result = generator.generate(pd.DataFrame({"col1": [1, 2, 3]}))
+
+    assert result["scaled"].tolist() == [10, 20, 30]
+