Skip to content

Commit 28d5773

Browse files
committed
addressing comments pt1
1 parent 37fffe4 commit 28d5773

File tree

9 files changed

+91
-90
lines changed

9 files changed

+91
-90
lines changed

examples/example.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,19 @@
1+
import json
2+
13
from data_designer.essentials import (
4+
BuildStage,
25
CategorySamplerParams,
36
DataDesigner,
47
DataDesignerConfigBuilder,
58
InferenceParameters,
9+
JsonlExportProcessorConfig,
610
LLMTextColumnConfig,
711
ModelConfig,
812
PersonSamplerParams,
13+
ProcessorType,
914
SamplerColumnConfig,
1015
Score,
1116
SubcategorySamplerParams,
12-
ToJsonlProcessorConfig,
1317
)
1418

1519
# define model aliases
@@ -192,10 +196,12 @@
192196
],
193197
}
194198

199+
template_as_str = json.dumps(jsonl_entry_template)
195200
config_builder.add_processor(
196-
ToJsonlProcessorConfig(
197-
template=jsonl_entry_template,
198-
folder_name="jsonl_files",
201+
JsonlExportProcessorConfig(
202+
processor_type=ProcessorType.JSONL_EXPORT,
203+
build_stage=BuildStage.POST_BATCH,
204+
template=template_as_str,
199205
fraction_per_file={
200206
"train.jsonl": 0.8,
201207
"validation.jsonl": 0.2,

src/data_designer/config/config_builder.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -314,13 +314,19 @@ def add_processor(
314314
Returns:
315315
The current Data Designer config builder instance.
316316
"""
317+
num_processors_of_type = len([p for p in self._processor_configs if p.processor_type == processor_type])
318+
317319
if processor_config is None:
318320
if processor_type is None:
319321
raise BuilderConfigurationError(
320322
"🛑 You must provide either a 'processor_config' object or 'processor_type' "
321323
"with additional keyword arguments."
322324
)
323-
processor_config = get_processor_config_from_kwargs(processor_type=processor_type, **kwargs)
325+
processor_config = get_processor_config_from_kwargs(
326+
processor_type=processor_type, name=f"{processor_type.value}-{num_processors_of_type + 1}", **kwargs
327+
)
328+
elif processor_config.name is None:
329+
processor_config.name = f"{processor_config.processor_type}-{num_processors_of_type + 1}"
324330

325331
# Checks elsewhere fail if DropColumnsProcessor drops a column but it is not marked for drop
326332
if processor_config.processor_type == ProcessorType.DROP_COLUMNS:

src/data_designer/config/processors.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,13 @@
1515

1616
class ProcessorType(str, Enum):
1717
DROP_COLUMNS = "drop_columns"
18-
TO_JSONL = "to_jsonl"
18+
JSONL_EXPORT = "jsonl_export"
1919

2020

2121
class ProcessorConfig(ConfigBase, ABC):
22+
name: str | None = Field(
23+
default=None, description="The name of the processor. If not provided, a default name will be generated."
24+
)
2225
build_stage: BuildStage = Field(
2326
default=BuildStage.POST_BATCH,
2427
description=f"The stage at which the processor will run. Supported stages: {', '.join(SUPPORTED_STAGES)}",
@@ -36,23 +39,22 @@ def validate_build_stage(cls, v: BuildStage) -> BuildStage:
3639
def get_processor_config_from_kwargs(processor_type: ProcessorType, **kwargs) -> ProcessorConfig:
3740
if processor_type == ProcessorType.DROP_COLUMNS:
3841
return DropColumnsProcessorConfig(**kwargs)
39-
elif processor_type == ProcessorType.TO_JSONL:
40-
return ToJsonlProcessorConfig(**kwargs)
42+
elif processor_type == ProcessorType.JSONL_EXPORT:
43+
return JsonlExportProcessorConfig(**kwargs)
4144

4245

4346
class DropColumnsProcessorConfig(ProcessorConfig):
4447
column_names: list[str]
4548
processor_type: Literal[ProcessorType.DROP_COLUMNS] = ProcessorType.DROP_COLUMNS
4649

4750

48-
class ToJsonlProcessorConfig(ProcessorConfig):
49-
template: dict = Field(..., description="The template to use for each entry in the dataset.")
50-
folder_name: str = Field(..., description="Folder where JSONL files will be saved.")
51+
class JsonlExportProcessorConfig(ProcessorConfig):
52+
template: str = Field(..., description="The template to use for each entry in the dataset, as a single string.")
5153
fraction_per_file: dict[str, float] = Field(
5254
default={"train.jsonl": 0.8, "validation.jsonl": 0.2},
5355
description="Fraction of the dataset to save in each file. The keys are the filenames and the values are the fractions.",
5456
)
55-
processor_type: Literal[ProcessorType.TO_JSONL] = ProcessorType.TO_JSONL
57+
processor_type: Literal[ProcessorType.JSONL_EXPORT] = ProcessorType.JSONL_EXPORT
5658

5759
@field_validator("fraction_per_file")
5860
def validate_fraction_per_file(cls, v: dict[str, float]) -> dict[str, float]:

src/data_designer/engine/dataset_builders/artifact_storage.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ class ArtifactStorage(BaseModel):
3232
final_dataset_folder_name: str = "parquet-files"
3333
partial_results_folder_name: str = "tmp-partial-parquet-files"
3434
dropped_columns_folder_name: str = "dropped-columns-parquet-files"
35-
outputs_folder_name: str = "outputs"
35+
processors_outputs_folder_name: str = "processors-files"
3636

3737
@property
3838
def artifact_path_exists(self) -> bool:
@@ -59,8 +59,8 @@ def partial_results_path(self) -> Path:
5959
return self.base_dataset_path / self.partial_results_folder_name
6060

6161
@property
62-
def outputs_path(self) -> Path:
63-
return self.base_dataset_path / self.outputs_folder_name
62+
def processors_outputs_path(self) -> Path:
63+
return self.base_dataset_path / self.processors_outputs_folder_name
6464

6565
@field_validator("artifact_path")
6666
def validate_artifact_path(cls, v: Union[Path, str]) -> Path:
@@ -183,10 +183,10 @@ def write_metadata(self, metadata: dict) -> Path:
183183
json.dump(metadata, file)
184184
return self.metadata_file_path
185185

186-
def move_to_outputs(self, from_path: Path, to_folder_name: str) -> Path:
187-
self.mkdir_if_needed(self.outputs_path / to_folder_name)
188-
shutil.move(from_path, self.outputs_path / to_folder_name / from_path.name)
189-
return self.outputs_path / to_folder_name / from_path.name
186+
def move_processor_output(self, from_path: Path, folder_name: str) -> Path:
187+
self.mkdir_if_needed(self.processors_outputs_path / folder_name)
188+
shutil.move(from_path, self.processors_outputs_path / folder_name / from_path.name)
189+
return self.processors_outputs_path / folder_name / from_path.name
190190

191191
def _get_stage_path(self, stage: BatchStage) -> Path:
192192
return getattr(self, resolve_string_enum(stage, BatchStage).value)

src/data_designer/engine/processing/processors/to_jsonl.py renamed to src/data_designer/engine/processing/processors/jsonl_export.py

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,13 @@
11
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

4-
import json
54
import logging
65
from pathlib import Path
76
import tempfile
87

98
import pandas as pd
109

11-
from data_designer.config.processors import ToJsonlProcessorConfig
10+
from data_designer.config.processors import JsonlExportProcessorConfig
1211
from data_designer.engine.configurable_task import ConfigurableTaskMetadata
1312
from data_designer.engine.processing.ginja.environment import WithJinja2UserTemplateRendering
1413
from data_designer.engine.processing.processors.base import Processor
@@ -17,19 +16,15 @@
1716
logger = logging.getLogger(__name__)
1817

1918

20-
class ToJsonlProcessor(WithJinja2UserTemplateRendering, Processor[ToJsonlProcessorConfig]):
19+
class JsonlExportProcessor(WithJinja2UserTemplateRendering, Processor[JsonlExportProcessorConfig]):
2120
@staticmethod
2221
def metadata() -> ConfigurableTaskMetadata:
2322
return ConfigurableTaskMetadata(
24-
name="to_jsonl",
23+
name="jsonl_export",
2524
description="Save formatted dataset as JSONL files.",
2625
required_resources=None,
2726
)
2827

29-
@property
30-
def template_as_string(self) -> str:
31-
return json.dumps(self.config.template)
32-
3328
def _get_stop_index_per_file(self, dataset_size: int) -> dict[str, int]:
3429
"""Helper function to get the end index for each file of the split."""
3530
stop_index_per_file = {}
@@ -42,7 +37,7 @@ def _get_stop_index_per_file(self, dataset_size: int) -> dict[str, int]:
4237
return stop_index_per_file
4338

4439
def process(self, data: pd.DataFrame, *, current_batch_number: int | None = None) -> pd.DataFrame:
45-
self.prepare_jinja2_template_renderer(self.template_as_string, data.columns.to_list())
40+
self.prepare_jinja2_template_renderer(self.config.template, data.columns.to_list())
4641

4742
stop_index_per_file = self._get_stop_index_per_file(len(data))
4843
with tempfile.TemporaryDirectory() as temp_dir:
@@ -60,6 +55,8 @@ def process(self, data: pd.DataFrame, *, current_batch_number: int | None = None
6055
f.write("\n")
6156
start_index = stop_index
6257

63-
self.artifact_storage.move_to_outputs(Path(temp_dir) / filename, self.config.folder_name)
58+
self.artifact_storage.move_processor_output(
59+
from_path=Path(temp_dir) / filename, folder_name=self.config.name
60+
)
6461

6562
return data

src/data_designer/engine/processing/processors/registry.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,12 @@
44
from data_designer.config.base import ConfigBase
55
from data_designer.config.processors import (
66
DropColumnsProcessorConfig,
7+
JsonlExportProcessorConfig,
78
ProcessorType,
8-
ToJsonlProcessorConfig,
99
)
1010
from data_designer.engine.processing.processors.base import Processor
1111
from data_designer.engine.processing.processors.drop_columns import DropColumnsProcessor
12-
from data_designer.engine.processing.processors.to_jsonl import ToJsonlProcessor
12+
from data_designer.engine.processing.processors.jsonl_export import JsonlExportProcessor
1313
from data_designer.engine.registry.base import TaskRegistry
1414

1515

@@ -19,5 +19,5 @@ class ProcessorRegistry(TaskRegistry[str, Processor, ConfigBase]): ...
1919
def create_default_processor_registry() -> ProcessorRegistry:
2020
registry = ProcessorRegistry()
2121
registry.register(ProcessorType.DROP_COLUMNS, DropColumnsProcessor, DropColumnsProcessorConfig, False)
22-
registry.register(ProcessorType.TO_JSONL, ToJsonlProcessor, ToJsonlProcessorConfig, False)
22+
registry.register(ProcessorType.JSONL_EXPORT, JsonlExportProcessor, JsonlExportProcessorConfig, False)
2323
return registry

src/data_designer/essentials/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
UniformDistribution,
3535
UniformDistributionParams,
3636
)
37-
from ..config.processors import DropColumnsProcessorConfig, ProcessorType, ToJsonlProcessorConfig
37+
from ..config.processors import DropColumnsProcessorConfig, JsonlExportProcessorConfig, ProcessorType
3838
from ..config.sampler_constraints import ColumnInequalityConstraint, ScalarInequalityConstraint
3939
from ..config.sampler_params import (
4040
BernoulliMixtureSamplerParams,
@@ -98,6 +98,7 @@
9898
"ImageFormat",
9999
"InferenceParameters",
100100
"JudgeScoreProfilerConfig",
101+
"JsonlExportProcessorConfig",
101102
"LLMCodeColumnConfig",
102103
"LLMJudgeColumnConfig",
103104
"LLMStructuredColumnConfig",
@@ -124,7 +125,6 @@
124125
"SeedDatasetColumnConfig",
125126
"SubcategorySamplerParams",
126127
"TimeDeltaSamplerParams",
127-
"ToJsonlProcessorConfig",
128128
"UniformDistribution",
129129
"UniformDistributionParams",
130130
"UniformSamplerParams",

tests/config/test_processors.py

Lines changed: 24 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77
from data_designer.config.dataset_builders import BuildStage
88
from data_designer.config.processors import (
99
DropColumnsProcessorConfig,
10+
JsonlExportProcessorConfig,
1011
ProcessorConfig,
1112
ProcessorType,
12-
ToJsonlProcessorConfig,
1313
get_processor_config_from_kwargs,
1414
)
1515

@@ -47,66 +47,59 @@ def test_drop_columns_processor_config_serialization():
4747
assert config_restored.column_names == config.column_names
4848

4949

50-
def test_to_jsonl_processor_config_creation():
51-
config = ToJsonlProcessorConfig(
50+
def test_jsonl_export_processor_config_creation():
51+
config = JsonlExportProcessorConfig(
5252
build_stage=BuildStage.POST_BATCH,
53-
template={"text": "{{ col1 }}"},
54-
folder_name="jsonl_output",
53+
template='{"text": "{{ col1 }}"}',
5554
fraction_per_file={"train.jsonl": 0.8, "validation.jsonl": 0.2},
5655
)
5756

5857
assert config.build_stage == BuildStage.POST_BATCH
59-
assert config.template == {"text": "{{ col1 }}"}
60-
assert config.folder_name == "jsonl_output"
58+
assert config.template == '{"text": "{{ col1 }}"}'
6159
assert config.fraction_per_file == {"train.jsonl": 0.8, "validation.jsonl": 0.2}
62-
assert config.processor_type == ProcessorType.TO_JSONL
60+
assert config.processor_type == ProcessorType.JSONL_EXPORT
6361
assert isinstance(config, ProcessorConfig)
6462

6563

66-
def test_to_jsonl_processor_config_validation():
64+
def test_jsonl_export_processor_config_validation():
6765
# Test unsupported stage raises error
6866
with pytest.raises(ValidationError, match="Invalid dataset builder stage"):
69-
ToJsonlProcessorConfig(
67+
JsonlExportProcessorConfig(
7068
build_stage=BuildStage.PRE_BATCH,
71-
template={"text": "{{ col1 }}"},
72-
folder_name="jsonl_output",
69+
template='{"text": "{{ col1 }}"}',
7370
fraction_per_file={"train.jsonl": 0.8, "validation.jsonl": 0.2},
7471
)
7572

7673
# Test missing required field raises error
7774
with pytest.raises(ValidationError, match="Field required"):
78-
ToJsonlProcessorConfig(build_stage=BuildStage.POST_BATCH, template={"text": "{{ col1 }}"})
75+
JsonlExportProcessorConfig(build_stage=BuildStage.POST_BATCH)
7976

8077
# Test invalid fraction per file raises error
8178
with pytest.raises(ValidationError, match="The fractions must sum to 1."):
82-
ToJsonlProcessorConfig(
79+
JsonlExportProcessorConfig(
8380
build_stage=BuildStage.POST_BATCH,
84-
template={"text": "{{ col1 }}"},
85-
folder_name="jsonl_output",
81+
template='{"text": "{{ col1 }}"}',
8682
fraction_per_file={"train.jsonl": 0.8, "validation.jsonl": 0.3},
8783
)
8884

8985

90-
def test_to_jsonl_processor_config_serialization():
91-
config = ToJsonlProcessorConfig(
86+
def test_jsonl_export_processor_config_serialization():
87+
config = JsonlExportProcessorConfig(
9288
build_stage=BuildStage.POST_BATCH,
93-
template={"text": "{{ col1 }}"},
94-
folder_name="jsonl_output",
89+
template='{"text": "{{ col1 }}"}',
9590
fraction_per_file={"train.jsonl": 0.8, "validation.jsonl": 0.2},
9691
)
9792

9893
# Serialize to dict
9994
config_dict = config.model_dump()
10095
assert config_dict["build_stage"] == "post_batch"
101-
assert config_dict["template"] == {"text": "{{ col1 }}"}
102-
assert config_dict["folder_name"] == "jsonl_output"
96+
assert config_dict["template"] == '{"text": "{{ col1 }}"}'
10397
assert config_dict["fraction_per_file"] == {"train.jsonl": 0.8, "validation.jsonl": 0.2}
10498

10599
# Deserialize from dict
106-
config_restored = ToJsonlProcessorConfig.model_validate(config_dict)
100+
config_restored = JsonlExportProcessorConfig.model_validate(config_dict)
107101
assert config_restored.build_stage == config.build_stage
108102
assert config_restored.template == config.template
109-
assert config_restored.folder_name == config.folder_name
110103
assert config_restored.fraction_per_file == config.fraction_per_file
111104

112105

@@ -119,18 +112,16 @@ def test_get_processor_config_from_kwargs():
119112
assert config_drop_columns.column_names == ["col1"]
120113
assert config_drop_columns.processor_type == ProcessorType.DROP_COLUMNS
121114

122-
config_to_jsonl = get_processor_config_from_kwargs(
123-
ProcessorType.TO_JSONL,
115+
config_jsonl_export = get_processor_config_from_kwargs(
116+
ProcessorType.JSONL_EXPORT,
124117
build_stage=BuildStage.POST_BATCH,
125-
template={"text": "{{ col1 }}"},
126-
folder_name="jsonl_output",
118+
template='{"text": "{{ col1 }}"}',
127119
fraction_per_file={"train.jsonl": 0.8, "validation.jsonl": 0.2},
128120
)
129-
assert isinstance(config_to_jsonl, ToJsonlProcessorConfig)
130-
assert config_to_jsonl.template == {"text": "{{ col1 }}"}
131-
assert config_to_jsonl.folder_name == "jsonl_output"
132-
assert config_to_jsonl.fraction_per_file == {"train.jsonl": 0.8, "validation.jsonl": 0.2}
133-
assert config_to_jsonl.processor_type == ProcessorType.TO_JSONL
121+
assert isinstance(config_jsonl_export, JsonlExportProcessorConfig)
122+
assert config_jsonl_export.template == '{"text": "{{ col1 }}"}'
123+
assert config_jsonl_export.fraction_per_file == {"train.jsonl": 0.8, "validation.jsonl": 0.2}
124+
assert config_jsonl_export.processor_type == ProcessorType.JSONL_EXPORT
134125

135126
# Test with unknown processor type returns None
136127
from enum import Enum

0 commit comments

Comments (0)