
Commit a9a6168

renaming
1 parent 3dfa7bb commit a9a6168

8 files changed, +63 -63 lines changed

src/data_designer/config/exports.py
Lines changed: 2 additions & 2 deletions

@@ -33,7 +33,7 @@
     UniformDistributionParams,
 )
 from data_designer.config.processors import (
-    AncillaryDatasetProcessorConfig,
+    SchemaTransformProcessorConfig,
     DropColumnsProcessorConfig,
     ProcessorType,
 )
@@ -73,7 +73,7 @@

 def get_config_exports() -> list[str]:
     return [
-        AncillaryDatasetProcessorConfig.__name__,
+        SchemaTransformProcessorConfig.__name__,
         BernoulliMixtureSamplerParams.__name__,
         BernoulliSamplerParams.__name__,
         BinomialSamplerParams.__name__,

src/data_designer/config/processors.py
Lines changed: 7 additions & 7 deletions

@@ -15,7 +15,7 @@

 class ProcessorType(str, Enum):
     DROP_COLUMNS = "drop_columns"
-    ANCILLARY_DATASET = "ancillary_dataset"
+    SCHEMA_TRANSFORM = "schema_transform"


 class ProcessorConfig(ConfigBase, ABC):
@@ -39,20 +39,20 @@ def validate_build_stage(cls, v: BuildStage) -> BuildStage:
 def get_processor_config_from_kwargs(processor_type: ProcessorType, **kwargs) -> ProcessorConfig:
     if processor_type == ProcessorType.DROP_COLUMNS:
         return DropColumnsProcessorConfig(**kwargs)
-    elif processor_type == ProcessorType.ANCILLARY_DATASET:
-        return AncillaryDatasetProcessorConfig(**kwargs)
+    elif processor_type == ProcessorType.SCHEMA_TRANSFORM:
+        return SchemaTransformProcessorConfig(**kwargs)


 class DropColumnsProcessorConfig(ProcessorConfig):
     column_names: list[str]
     processor_type: Literal[ProcessorType.DROP_COLUMNS] = ProcessorType.DROP_COLUMNS


-class AncillaryDatasetProcessorConfig(ProcessorConfig):
+class SchemaTransformProcessorConfig(ProcessorConfig):
     template: dict[str, Any] = Field(
         ...,
         description="""
-        Dictionary specifying columns and templates to use in the ancillary dataset.
+        Dictionary specifying columns and templates to use in the new dataset with transformed schema.

        Each key is a new column name, and each value is an object containing Jinja2 templates - for instance, a string or a list of strings.
        Values must be JSON-serializable.
@@ -67,8 +67,8 @@ class AncillaryDatasetProcessorConfig(ProcessorConfig):
        }
        ```

-        The above templates will create an ancillary dataset with three columns: "list_of_strings", "uppercase_string", and "lowercase_string".
+        The above templates will create a new dataset with three columns: "list_of_strings", "uppercase_string", and "lowercase_string".
        References to columns "col1" and "col2" in the templates will be replaced with the actual values of the columns in the dataset.
        """,
     )
-    processor_type: Literal[ProcessorType.ANCILLARY_DATASET] = ProcessorType.ANCILLARY_DATASET
+    processor_type: Literal[ProcessorType.SCHEMA_TRANSFORM] = ProcessorType.SCHEMA_TRANSFORM
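
As a quick orientation to the renamed config, here is a minimal sketch of how it could be constructed after this change. It uses only the fields visible in this diff (name, build_stage, template); the processor name and template values are illustrative, and importing get_processor_config_from_kwargs from data_designer.config.processors is assumed because the function is defined there.

    from data_designer.config.dataset_builders import BuildStage
    from data_designer.config.processors import (
        ProcessorType,
        SchemaTransformProcessorConfig,
        get_processor_config_from_kwargs,  # assumed importable from the defining module
    )

    # Each template key becomes a column in the transformed dataset;
    # each value is a Jinja2 template over existing columns.
    config = SchemaTransformProcessorConfig(
        name="my_schema_transform",          # illustrative name
        build_stage=BuildStage.POST_BATCH,   # the stage exercised in the tests below
        template={"text": "{{ col1 }} / {{ col2 }}"},
    )
    assert config.processor_type == ProcessorType.SCHEMA_TRANSFORM

    # The factory dispatches on the renamed enum member, as in the hunk above.
    same_config = get_processor_config_from_kwargs(
        ProcessorType.SCHEMA_TRANSFORM,
        name="my_schema_transform",
        build_stage=BuildStage.POST_BATCH,
        template={"text": "{{ col1 }} / {{ col2 }}"},
    )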

src/data_designer/config/utils/validation.py
Lines changed: 3 additions & 3 deletions

@@ -68,7 +68,7 @@ def validate_data_designer_config(
     violations.extend(validate_expression_references(columns=columns, allowed_references=allowed_references))
     violations.extend(validate_columns_not_all_dropped(columns=columns))
     violations.extend(validate_drop_columns_processor(columns=columns, processor_configs=processor_configs))
-    violations.extend(validate_ancillary_dataset_processor(columns=columns, processor_configs=processor_configs))
+    violations.extend(validate_schema_transform_processor(columns=columns, processor_configs=processor_configs))
     if not can_run_data_designer_locally():
         violations.extend(validate_local_only_columns(columns=columns))
     return violations
@@ -294,15 +294,15 @@ def validate_drop_columns_processor(
     return []


-def validate_ancillary_dataset_processor(
+def validate_schema_transform_processor(
     columns: list[ColumnConfigT],
     processor_configs: list[ProcessorConfig],
 ) -> list[Violation]:
     violations = []

     all_column_names = {c.name for c in columns}
     for processor_config in processor_configs:
-        if processor_config.processor_type == ProcessorType.ANCILLARY_DATASET:
+        if processor_config.processor_type == ProcessorType.SCHEMA_TRANSFORM:
             try:
                 json.dumps(processor_config.template)
             except TypeError as e:
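
The core check in the renamed validator, as visible in the hunk above, is that the template survives json.dumps. A standalone sketch of that check (the helper name here is mine, not from the codebase):

    import json

    def template_is_json_serializable(template: dict) -> bool:
        """Mirror the json.dumps / except TypeError check from the hunk above."""
        try:
            json.dumps(template)
            return True
        except TypeError:
            return False

    assert template_is_json_serializable({"text": "{{ col1 }}"})    # strings serialize fine
    assert not template_is_json_serializable({"text": {1, 2, 3}})   # a set is not JSON-serializable

This is why the test fixture below includes a config whose template value is a set: it is the case that should produce a violation.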

src/data_designer/engine/processing/processors/registry.py
Lines changed: 3 additions & 3 deletions

@@ -3,11 +3,11 @@

 from data_designer.config.base import ConfigBase
 from data_designer.config.processors import (
-    AncillaryDatasetProcessorConfig,
+    SchemaTransformProcessorConfig,
     DropColumnsProcessorConfig,
     ProcessorType,
 )
-from data_designer.engine.processing.processors.ancillary_dataset import AncillaryDatasetProcessor
+from data_designer.engine.processing.processors.schema_transform import SchemaTransformProcessor
 from data_designer.engine.processing.processors.base import Processor
 from data_designer.engine.processing.processors.drop_columns import DropColumnsProcessor
 from data_designer.engine.registry.base import TaskRegistry
@@ -19,7 +19,7 @@ class ProcessorRegistry(TaskRegistry[str, Processor, ConfigBase]): ...
 def create_default_processor_registry() -> ProcessorRegistry:
     registry = ProcessorRegistry()
     registry.register(
-        ProcessorType.ANCILLARY_DATASET, AncillaryDatasetProcessor, AncillaryDatasetProcessorConfig, False
+        ProcessorType.SCHEMA_TRANSFORM, SchemaTransformProcessor, SchemaTransformProcessorConfig, False
     )
     registry.register(ProcessorType.DROP_COLUMNS, DropColumnsProcessor, DropColumnsProcessorConfig, False)
     return registry
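
For readers unfamiliar with the registry pattern being touched here: register() associates a ProcessorType key with a processor class and its config class. The following is a generic, hypothetical sketch of that idea, not data_designer's actual TaskRegistry API (whose fourth boolean argument is not explained by this diff, so it is omitted):

    from enum import Enum

    class ProcessorType(str, Enum):
        DROP_COLUMNS = "drop_columns"
        SCHEMA_TRANSFORM = "schema_transform"

    class MiniRegistry:
        """Hypothetical minimal registry: maps a processor type to (task class, config class)."""

        def __init__(self) -> None:
            self._entries: dict[ProcessorType, tuple[type, type]] = {}

        def register(self, key: ProcessorType, task_cls: type, config_cls: type) -> None:
            self._entries[key] = (task_cls, config_cls)

        def lookup(self, key: ProcessorType) -> tuple[type, type]:
            return self._entries[key]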

src/data_designer/engine/processing/processors/ancillary_dataset.py renamed to src/data_designer/engine/processing/processors/schema_transform.py
Lines changed: 4 additions & 4 deletions

@@ -6,7 +6,7 @@

 import pandas as pd

-from data_designer.config.processors import AncillaryDatasetProcessorConfig
+from data_designer.config.processors import SchemaTransformProcessorConfig
 from data_designer.engine.configurable_task import ConfigurableTaskMetadata
 from data_designer.engine.dataset_builders.artifact_storage import BatchStage
 from data_designer.engine.processing.ginja.environment import WithJinja2UserTemplateRendering
@@ -16,12 +16,12 @@
 logger = logging.getLogger(__name__)


-class AncillaryDatasetProcessor(WithJinja2UserTemplateRendering, Processor[AncillaryDatasetProcessorConfig]):
+class SchemaTransformProcessor(WithJinja2UserTemplateRendering, Processor[SchemaTransformProcessorConfig]):
     @staticmethod
     def metadata() -> ConfigurableTaskMetadata:
         return ConfigurableTaskMetadata(
-            name="ancillary_dataset_processor",
-            description="Generate an ancillary dataset using a Jinja2 template.",
+            name="schema_transform_processor",
+            description="Generate dataset with transformed schema using a Jinja2 template.",
             required_resources=None,
         )

tests/config/test_processors.py
Lines changed: 15 additions & 15 deletions

@@ -6,7 +6,7 @@

 from data_designer.config.dataset_builders import BuildStage
 from data_designer.config.processors import (
-    AncillaryDatasetProcessorConfig,
+    SchemaTransformProcessorConfig,
     DropColumnsProcessorConfig,
     ProcessorConfig,
     ProcessorType,
@@ -53,35 +53,35 @@ def test_drop_columns_processor_config_serialization():
     assert config_restored.column_names == config.column_names


-def test_ancillary_dataset_processor_config_creation():
-    config = AncillaryDatasetProcessorConfig(
+def test_schema_transform_processor_config_creation():
+    config = SchemaTransformProcessorConfig(
         name="output_format_processor",
         build_stage=BuildStage.POST_BATCH,
         template={"text": "{{ col1 }}"},
     )

     assert config.build_stage == BuildStage.POST_BATCH
     assert config.template == {"text": "{{ col1 }}"}
-    assert config.processor_type == ProcessorType.ANCILLARY_DATASET
+    assert config.processor_type == ProcessorType.SCHEMA_TRANSFORM
     assert isinstance(config, ProcessorConfig)


-def test_ancillary_dataset_processor_config_validation():
+def test_schema_transform_processor_config_validation():
     # Test unsupported stage raises error
     with pytest.raises(ValidationError, match="Invalid dataset builder stage"):
-        AncillaryDatasetProcessorConfig(
-            name="ancillary_dataset_processor",
+        SchemaTransformProcessorConfig(
+            name="schema_transform_processor",
             build_stage=BuildStage.PRE_BATCH,
             template={"text": "{{ col1 }}"},
         )

     # Test missing required field raises error
     with pytest.raises(ValidationError, match="Field required"):
-        AncillaryDatasetProcessorConfig(name="ancillary_dataset_processor", build_stage=BuildStage.POST_BATCH)
+        SchemaTransformProcessorConfig(name="schema_transform_processor", build_stage=BuildStage.POST_BATCH)


 def test_output_format_processor_config_serialization():
-    config = AncillaryDatasetProcessorConfig(
+    config = SchemaTransformProcessorConfig(
         name="output_format_processor",
         build_stage=BuildStage.POST_BATCH,
         template={"text": "{{ col1 }}"},
@@ -93,7 +93,7 @@ def test_output_format_processor_config_serialization():
     assert config_dict["template"] == {"text": "{{ col1 }}"}

     # Deserialize from dict
-    config_restored = AncillaryDatasetProcessorConfig.model_validate(config_dict)
+    config_restored = SchemaTransformProcessorConfig.model_validate(config_dict)
     assert config_restored.build_stage == config.build_stage
     assert config_restored.template == config.template

@@ -110,15 +110,15 @@ def test_get_processor_config_from_kwargs():
     assert config_drop_columns.column_names == ["col1"]
     assert config_drop_columns.processor_type == ProcessorType.DROP_COLUMNS

-    config_ancillary_dataset = get_processor_config_from_kwargs(
-        ProcessorType.ANCILLARY_DATASET,
+    config_schema_transform = get_processor_config_from_kwargs(
+        ProcessorType.SCHEMA_TRANSFORM,
         name="output_format_processor",
         build_stage=BuildStage.POST_BATCH,
         template={"text": "{{ col1 }}"},
     )
-    assert isinstance(config_ancillary_dataset, AncillaryDatasetProcessorConfig)
-    assert config_ancillary_dataset.template == {"text": "{{ col1 }}"}
-    assert config_ancillary_dataset.processor_type == ProcessorType.ANCILLARY_DATASET
+    assert isinstance(config_schema_transform, SchemaTransformProcessorConfig)
+    assert config_schema_transform.template == {"text": "{{ col1 }}"}
+    assert config_schema_transform.processor_type == ProcessorType.SCHEMA_TRANSFORM

     # Test with unknown processor type returns None
     from enum import Enum

tests/config/utils/test_validation.py
Lines changed: 13 additions & 13 deletions

@@ -15,7 +15,7 @@
 from data_designer.config.dataset_builders import BuildStage
 from data_designer.config.models import ImageContext, ModalityDataType
 from data_designer.config.processors import (
-    AncillaryDatasetProcessorConfig,
+    SchemaTransformProcessorConfig,
     DropColumnsProcessorConfig,
 )
 from data_designer.config.utils.code_lang import CodeLang
@@ -24,7 +24,7 @@
     ViolationLevel,
     ViolationType,
     rich_print_violations,
-    validate_ancillary_dataset_processor,
+    validate_schema_transform_processor,
     validate_code_validation,
     validate_columns_not_all_dropped,
     validate_data_designer_config,
@@ -106,13 +106,13 @@
         column_names=["inexistent_column"],
         build_stage=BuildStage.POST_BATCH,
     ),
-    AncillaryDatasetProcessorConfig(
-        name="ancillary_dataset_processor_invalid_reference",
+    SchemaTransformProcessorConfig(
+        name="schema_transform_processor_invalid_reference",
        template={"text": "{{ invalid_reference }}"},
        build_stage=BuildStage.POST_BATCH,
    ),
-    AncillaryDatasetProcessorConfig(
-        name="ancillary_dataset_processor_invalid_template",
+    SchemaTransformProcessorConfig(
+        name="schema_transform_processor_invalid_template",
        template={"text": {1, 2, 3}},
        build_stage=BuildStage.POST_BATCH,
    ),
@@ -125,14 +125,14 @@
 @patch("data_designer.config.utils.validation.validate_expression_references")
 @patch("data_designer.config.utils.validation.validate_columns_not_all_dropped")
 @patch("data_designer.config.utils.validation.validate_drop_columns_processor")
-@patch("data_designer.config.utils.validation.validate_ancillary_dataset_processor")
+@patch("data_designer.config.utils.validation.validate_schema_transform_processor")
 def test_validate_data_designer_config(
     mock_validate_columns_not_all_dropped,
     mock_validate_expression_references,
     mock_validate_code_validation,
     mock_validate_prompt_templates,
     mock_validate_drop_columns_processor,
-    mock_validate_ancillary_dataset_processor,
+    mock_validate_schema_transform_processor,
 ):
     mock_validate_columns_not_all_dropped.return_value = [
         Violation(
@@ -174,7 +174,7 @@ def test_validate_data_designer_config(
             level=ViolationLevel.ERROR,
         )
     ]
-    mock_validate_ancillary_dataset_processor.return_value = [
+    mock_validate_schema_transform_processor.return_value = [
         Violation(
             column="text",
             type=ViolationType.INVALID_REFERENCE,
@@ -196,7 +196,7 @@
     mock_validate_code_validation.assert_called_once()
     mock_validate_prompt_templates.assert_called_once()
     mock_validate_drop_columns_processor.assert_called_once()
-    mock_validate_ancillary_dataset_processor.assert_called_once()
+    mock_validate_schema_transform_processor.assert_called_once()


 def test_validate_prompt_templates():
@@ -281,8 +281,8 @@ def test_validate_expression_references():
     assert violations[0].type == ViolationType.EXPRESSION_REFERENCE_MISSING


-def test_validate_ancillary_dataset_processor():
-    violations = validate_ancillary_dataset_processor(COLUMNS, PROCESSOR_CONFIGS)
+def test_validate_schema_transform_processor():
+    violations = validate_schema_transform_processor(COLUMNS, PROCESSOR_CONFIGS)
     assert len(violations) == 2
     assert violations[0].type == ViolationType.INVALID_REFERENCE
     assert violations[0].column is None
@@ -295,7 +295,7 @@ def test_validate_ancillary_dataset_processor():
     assert violations[1].column is None
     assert (
         violations[1].message
-        == "Ancillary dataset processor ancillary_dataset_processor_invalid_template template is not a valid JSON object."
+        == "Ancillary dataset processor schema_transform_processor_invalid_template template is not a valid JSON object."
     )
     assert violations[1].level == ViolationLevel.ERROR
