Commit db6a276
lint plus tests
1 parent 82e8641 commit db6a276

8 files changed (+112, -99 lines)

examples/example.py
Lines changed: 1 addition & 1 deletion

@@ -193,4 +193,4 @@
 preview = dd.preview(config_builder, num_records=10)

 results = dd.create(config_builder, num_records=20)
-results.write_processor_outputs_to_disk("./processor_outputs", "jsonl")
+results.write_processor_outputs_to_disk("./processor_outputs", "jsonl")

src/data_designer/config/processors.py
Lines changed: 1 addition & 1 deletion

@@ -50,4 +50,4 @@ class DropColumnsProcessorConfig(ProcessorConfig):


 class OutputFormatProcessorConfig(ProcessorConfig):
     template: str = Field(..., description="The template to use for each entry in the dataset, as a single string.")
-    processor_type: Literal[ProcessorType.OUTPUT_FORMAT] = ProcessorType.OUTPUT_FORMAT
+    processor_type: Literal[ProcessorType.OUTPUT_FORMAT] = ProcessorType.OUTPUT_FORMAT
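For reference, the renamed config is exercised by the tests further down in this commit; a minimal construction sketch using only arguments those tests show (nothing here is beyond what the diff itself contains):

    from data_designer.config.dataset_builders import BuildStage
    from data_designer.config.processors import OutputFormatProcessorConfig, ProcessorType

    # Mirrors tests/config/test_processors.py below: POST_BATCH stage plus a
    # single-string Jinja2 template; processor_type defaults to OUTPUT_FORMAT.
    config = OutputFormatProcessorConfig(
        build_stage=BuildStage.POST_BATCH,
        template='{"text": "{{ col1 }}"}',
    )
    assert config.processor_type == ProcessorType.OUTPUT_FORMAT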

src/data_designer/engine/processing/processors/output_format.py
Lines changed: 3 additions & 1 deletion

@@ -26,7 +26,9 @@ def metadata() -> ConfigurableTaskMetadata:

     def process(self, data: pd.DataFrame, *, current_batch_number: int | None = None) -> pd.DataFrame:
         self.prepare_jinja2_template_renderer(self.config.template, data.columns.to_list())
-        formatted_records = [self.render_template(deserialize_json_values(record)) for record in data.to_dict(orient="records")]
+        formatted_records = [
+            self.render_template(deserialize_json_values(record)) for record in data.to_dict(orient="records")
+        ]
         formatted_data = pd.DataFrame(formatted_records, columns=["formatted_output"])
         if current_batch_number is not None:
             self.artifact_storage.write_batch_to_parquet_file(
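The wrapped comprehension renders the configured template once per record and collects the results into a single "formatted_output" column. A standalone sketch of that per-record rendering, substituting plain jinja2.Template for the processor's prepare_jinja2_template_renderer/render_template helpers (which this hunk does not show):

    import pandas as pd
    from jinja2 import Template

    data = pd.DataFrame({"col1": ["hello", "world"], "col2": [1, 2]})
    template = Template('{"text": "{{ col1 }}", "value": "{{ col2 }}"}')
    # One rendered string per record, collected into a one-column frame.
    formatted_records = [template.render(**record) for record in data.to_dict(orient="records")]
    formatted_data = pd.DataFrame(formatted_records, columns=["formatted_output"])
    print(formatted_data.iloc[0]["formatted_output"])  # {"text": "hello", "value": "1"}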

src/data_designer/essentials/__init__.py
Lines changed: 1 addition & 1 deletion

@@ -99,7 +99,6 @@
     "ImageFormat",
     "InferenceParameters",
     "JudgeScoreProfilerConfig",
-    "JsonlExportProcessorConfig",
     "LLMCodeColumnConfig",
     "LLMJudgeColumnConfig",
     "LLMStructuredColumnConfig",
@@ -111,6 +110,7 @@
     "ModalityContext",
     "ModalityDataType",
     "ModelConfig",
+    "OutputFormatProcessorConfig",
     "PartitionBlock",
     "PersonSamplerParams",
     "PersonFromFakerSamplerParams",

src/data_designer/interface/results.py
Lines changed: 2 additions & 1 deletion

@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0

 from __future__ import annotations
+
 from pathlib import Path
 from typing import Literal

@@ -71,4 +72,4 @@ def write_processor_outputs_to_disk(self, output_folder: Path | str, extension:
         # TODO: faster way to convert than reading and writing row by row?
         dataframe = pd.read_parquet(file_path)
         for _, row in dataframe.iterrows():
-            f.write(row["formatted_output"].replace("\n", "\\n") + "\n")
+            f.write(row["formatted_output"].replace("\n", "\\n") + "\n")
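The .replace("\n", "\\n") on the written line keeps each formatted record on a single physical line, which line-oriented formats such as JSONL require. A small stdlib-only illustration:

    # A formatted record containing a literal newline would otherwise span two lines.
    formatted_output = "first line\nsecond line"
    escaped = formatted_output.replace("\n", "\\n")
    assert "\n" not in escaped  # the record now occupies exactly one line on disk
    print(escaped)  # first line\nsecond line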

tests/config/test_processors.py
Lines changed: 15 additions & 31 deletions

@@ -7,7 +7,7 @@
 from data_designer.config.dataset_builders import BuildStage
 from data_designer.config.processors import (
     DropColumnsProcessorConfig,
-    JsonlExportProcessorConfig,
+    OutputFormatProcessorConfig,
     ProcessorConfig,
     ProcessorType,
     get_processor_config_from_kwargs,
@@ -47,60 +47,46 @@ def test_drop_columns_processor_config_serialization():
     assert config_restored.column_names == config.column_names


-def test_jsonl_export_processor_config_creation():
-    config = JsonlExportProcessorConfig(
+def test_output_format_processor_config_creation():
+    config = OutputFormatProcessorConfig(
         build_stage=BuildStage.POST_BATCH,
         template='{"text": "{{ col1 }}"}',
-        fraction_per_file={"train.jsonl": 0.8, "validation.jsonl": 0.2},
     )

     assert config.build_stage == BuildStage.POST_BATCH
     assert config.template == '{"text": "{{ col1 }}"}'
-    assert config.fraction_per_file == {"train.jsonl": 0.8, "validation.jsonl": 0.2}
-    assert config.processor_type == ProcessorType.JSONL_EXPORT
+    assert config.processor_type == ProcessorType.OUTPUT_FORMAT
     assert isinstance(config, ProcessorConfig)


-def test_jsonl_export_processor_config_validation():
+def test_output_format_processor_config_validation():
     # Test unsupported stage raises error
     with pytest.raises(ValidationError, match="Invalid dataset builder stage"):
-        JsonlExportProcessorConfig(
+        OutputFormatProcessorConfig(
             build_stage=BuildStage.PRE_BATCH,
             template='{"text": "{{ col1 }}"}',
-            fraction_per_file={"train.jsonl": 0.8, "validation.jsonl": 0.2},
         )

     # Test missing required field raises error
     with pytest.raises(ValidationError, match="Field required"):
-        JsonlExportProcessorConfig(build_stage=BuildStage.POST_BATCH)
-
-    # Test invalid fraction per file raises error
-    with pytest.raises(ValidationError, match="The fractions must sum to 1."):
-        JsonlExportProcessorConfig(
-            build_stage=BuildStage.POST_BATCH,
-            template='{"text": "{{ col1 }}"}',
-            fraction_per_file={"train.jsonl": 0.8, "validation.jsonl": 0.3},
-        )
+        OutputFormatProcessorConfig(build_stage=BuildStage.POST_BATCH)


-def test_jsonl_export_processor_config_serialization():
-    config = JsonlExportProcessorConfig(
+def test_output_format_processor_config_serialization():
+    config = OutputFormatProcessorConfig(
         build_stage=BuildStage.POST_BATCH,
         template='{"text": "{{ col1 }}"}',
-        fraction_per_file={"train.jsonl": 0.8, "validation.jsonl": 0.2},
     )

     # Serialize to dict
     config_dict = config.model_dump()
     assert config_dict["build_stage"] == "post_batch"
     assert config_dict["template"] == '{"text": "{{ col1 }}"}'
-    assert config_dict["fraction_per_file"] == {"train.jsonl": 0.8, "validation.jsonl": 0.2}

     # Deserialize from dict
-    config_restored = JsonlExportProcessorConfig.model_validate(config_dict)
+    config_restored = OutputFormatProcessorConfig.model_validate(config_dict)
     assert config_restored.build_stage == config.build_stage
     assert config_restored.template == config.template
-    assert config_restored.fraction_per_file == config.fraction_per_file


 def test_get_processor_config_from_kwargs():
@@ -112,16 +98,14 @@ def test_get_processor_config_from_kwargs():
     assert config_drop_columns.column_names == ["col1"]
     assert config_drop_columns.processor_type == ProcessorType.DROP_COLUMNS

-    config_jsonl_export = get_processor_config_from_kwargs(
-        ProcessorType.JSONL_EXPORT,
+    config_output_format = get_processor_config_from_kwargs(
+        ProcessorType.OUTPUT_FORMAT,
         build_stage=BuildStage.POST_BATCH,
         template='{"text": "{{ col1 }}"}',
-        fraction_per_file={"train.jsonl": 0.8, "validation.jsonl": 0.2},
     )
-    assert isinstance(config_jsonl_export, JsonlExportProcessorConfig)
-    assert config_jsonl_export.template == '{"text": "{{ col1 }}"}'
-    assert config_jsonl_export.fraction_per_file == {"train.jsonl": 0.8, "validation.jsonl": 0.2}
-    assert config_jsonl_export.processor_type == ProcessorType.JSONL_EXPORT
+    assert isinstance(config_output_format, OutputFormatProcessorConfig)
+    assert config_output_format.template == '{"text": "{{ col1 }}"}'
+    assert config_output_format.processor_type == ProcessorType.OUTPUT_FORMAT

     # Test with unknown processor type returns None
     from enum import Enum

tests/engine/processing/processors/test_output_format.py
Lines changed: 86 additions & 63 deletions

@@ -2,34 +2,34 @@
 # SPDX-License-Identifier: Apache-2.0

 import json
-from pathlib import Path
-from unittest.mock import Mock, patch
+from unittest.mock import Mock

 import pandas as pd
 import pytest

 from data_designer.config.dataset_builders import BuildStage
-from data_designer.config.processors import JsonlExportProcessorConfig
-from data_designer.engine.processing.processors.jsonl_export import JsonlExportProcessor
+from data_designer.config.processors import OutputFormatProcessorConfig
+from data_designer.engine.dataset_builders.artifact_storage import BatchStage
+from data_designer.engine.processing.processors.output_format import OutputFormatProcessor


 @pytest.fixture
-def stub_processor_config() -> JsonlExportProcessorConfig:
-    return JsonlExportProcessorConfig(
+def stub_processor_config() -> OutputFormatProcessorConfig:
+    return OutputFormatProcessorConfig(
         build_stage=BuildStage.POST_BATCH,
         template='{"text": "{{ col1 }}", "value": "{{ col2 }}"}',
-        fraction_per_file={"train.jsonl": 0.75, "validation.jsonl": 0.25},
+        name="test_output_format",
     )


 @pytest.fixture
-def stub_processor(stub_processor_config: JsonlExportProcessorConfig, tmp_path: Path) -> JsonlExportProcessor:
+def stub_processor(stub_processor_config: OutputFormatProcessorConfig) -> OutputFormatProcessor:
     mock_resource_provider = Mock()
     mock_artifact_storage = Mock()
-    mock_artifact_storage.move_processor_output = Mock()
+    mock_artifact_storage.write_batch_to_parquet_file = Mock()
     mock_resource_provider.artifact_storage = mock_artifact_storage

-    processor = JsonlExportProcessor(
+    processor = OutputFormatProcessor(
         config=stub_processor_config,
         resource_provider=mock_resource_provider,
     )
@@ -47,80 +47,103 @@ def stub_simple_dataframe() -> pd.DataFrame:


 def test_metadata() -> None:
-    metadata = JsonlExportProcessor.metadata()
+    metadata = OutputFormatProcessor.metadata()

-    assert metadata.name == "jsonl_export"
-    assert metadata.description == "Save formatted dataset as JSONL files."
+    assert metadata.name == "output_format"
+    assert metadata.description == "Format the dataset using a Jinja2 template."
     assert metadata.required_resources is None


-def test_template_as_string(stub_processor: JsonlExportProcessor) -> None:
-    template_str = stub_processor.config.template
-    assert isinstance(template_str, str)
-    assert template_str == '{"text": "{{ col1 }}", "value": "{{ col2 }}"}'
+def test_process_returns_original_dataframe(
+    stub_processor: OutputFormatProcessor, stub_simple_dataframe: pd.DataFrame
+) -> None:
+    result = stub_processor.process(stub_simple_dataframe, current_batch_number=0)
+    pd.testing.assert_frame_equal(result, stub_simple_dataframe)
+

+def test_process_writes_formatted_output_to_parquet(
+    stub_processor: OutputFormatProcessor, stub_simple_dataframe: pd.DataFrame
+) -> None:
+    # Capture the formatted dataframe that is written to parquet
+    written_dataframe: pd.DataFrame | None = None

-def test_get_stop_index_per_file(stub_processor: JsonlExportProcessor) -> None:
-    stub_processor.config.fraction_per_file = {"train.jsonl": 0.8, "val.jsonl": 0.15, "test.jsonl": 0.05}
-    result = stub_processor._get_stop_index_per_file(100)
+    def capture_dataframe(batch_number: int, dataframe: pd.DataFrame, batch_stage: BatchStage, subfolder: str) -> None:
+        nonlocal written_dataframe
+        written_dataframe = dataframe

-    assert result == {"train.jsonl": 80, "val.jsonl": 95, "test.jsonl": 100}
+    stub_processor.artifact_storage.write_batch_to_parquet_file.side_effect = capture_dataframe

+    # Process the dataframe
+    result = stub_processor.process(stub_simple_dataframe, current_batch_number=0)

-def test_process_returns_original_dataframe(
-    stub_processor: JsonlExportProcessor, stub_simple_dataframe: pd.DataFrame
-) -> None:
-    result = stub_processor.process(stub_simple_dataframe)
+    # Verify the original dataframe is returned
     pd.testing.assert_frame_equal(result, stub_simple_dataframe)

+    # Verify write_batch_to_parquet_file was called with correct parameters
+    stub_processor.artifact_storage.write_batch_to_parquet_file.assert_called_once()
+    call_args = stub_processor.artifact_storage.write_batch_to_parquet_file.call_args
+
+    assert call_args.kwargs["batch_number"] == 0
+    assert call_args.kwargs["batch_stage"] == BatchStage.PROCESSORS_OUTPUTS
+    assert call_args.kwargs["subfolder"] == "test_output_format"
+
+    # Verify the formatted dataframe has the correct structure
+    assert written_dataframe is not None
+    assert list(written_dataframe.columns) == ["formatted_output"]
+    assert len(written_dataframe) == 4
+
+    # Verify the formatted content
+    expected_formatted_output = [
+        '{"text": "hello", "value": "1"}',
+        '{"text": "world", "value": "2"}',
+        '{"text": "test", "value": "3"}',
+        '{"text": "data", "value": "4"}',
+    ]

-def test_process_writes_correct_content_to_files(
-    stub_processor: JsonlExportProcessor, stub_simple_dataframe: pd.DataFrame
-) -> None:
-    stub_processor.config.fraction_per_file = {"train.jsonl": 0.75, "validation.jsonl": 0.25}
-
-    # Capture the content of the files that are written to the outputs folder
-    file_contents: dict[str, str] = {}
+    for i, expected in enumerate(expected_formatted_output):
+        actual = written_dataframe.iloc[i]["formatted_output"]
+        # Parse both as JSON to compare structure (ignoring whitespace differences)
+        assert json.loads(actual) == json.loads(expected), f"Row {i} mismatch: {actual} != {expected}"

-    def capture_file_content(from_path: Path, folder_name: str) -> None:
-        with open(from_path, "r") as f:
-            file_contents[from_path.name] = f.read()

-    stub_processor.artifact_storage.move_processor_output.side_effect = capture_file_content
+def test_process_without_batch_number_does_not_write(
+    stub_processor: OutputFormatProcessor, stub_simple_dataframe: pd.DataFrame
+) -> None:
+    # Process without batch number (preview mode)
+    result = stub_processor.process(stub_simple_dataframe, current_batch_number=None)

-    # Process the dataframe and write the files to the outputs folder
-    with patch("data_designer.engine.processing.processors.jsonl_export.logger"):
-        stub_processor.process(stub_simple_dataframe)
+    # Verify the original dataframe is returned
+    pd.testing.assert_frame_equal(result, stub_simple_dataframe)

-    # Check that the files were moved with the correct names
-    assert stub_processor.artifact_storage.move_processor_output.call_count == 2
+    # Verify write_batch_to_parquet_file was NOT called
+    stub_processor.artifact_storage.write_batch_to_parquet_file.assert_not_called()

-    assert "train.jsonl" in file_contents
-    assert "validation.jsonl" in file_contents

-    # Check that the files contain the correct content
-    train_lines = file_contents["train.jsonl"].strip().split("\n") if file_contents["train.jsonl"].strip() else []
-    validation_lines = (
-        file_contents["validation.jsonl"].strip().split("\n") if file_contents["validation.jsonl"].strip() else []
+def test_process_with_json_serialized_values(stub_processor: OutputFormatProcessor) -> None:
+    # Test with JSON-serialized values in dataframe
+    df_with_json = pd.DataFrame(
+        {
+            "col1": ["hello", "world"],
+            "col2": ['{"nested": "value1"}', '{"nested": "value2"}'],
+        }
     )

-    assert len(train_lines) == 3, f"Expected 3 lines in train.jsonl, got {len(train_lines)}"
-    assert len(validation_lines) == 1, f"Expected 1 line in validation.jsonl, got {len(validation_lines)}"
+    written_dataframe: pd.DataFrame | None = None

-    expected_train_data = [
-        {"text": "hello", "value": "1"},
-        {"text": "world", "value": "2"},
-        {"text": "test", "value": "3"},
-    ]
+    def capture_dataframe(batch_number: int, dataframe: pd.DataFrame, batch_stage: BatchStage, subfolder: str) -> None:
+        nonlocal written_dataframe
+        written_dataframe = dataframe
+
+    stub_processor.artifact_storage.write_batch_to_parquet_file.side_effect = capture_dataframe

-    for i, line in enumerate(train_lines):
-        parsed = json.loads(line)
-        assert parsed == expected_train_data[i], f"Train line {i} mismatch: {parsed} != {expected_train_data[i]}"
+    # Process the dataframe
+    stub_processor.process(df_with_json, current_batch_number=0)

-    expected_validation_data = [{"text": "data", "value": "4"}]
+    # Verify the formatted dataframe was written
+    assert written_dataframe is not None
+    assert len(written_dataframe) == 2

-    for i, line in enumerate(validation_lines):
-        parsed = json.loads(line)
-        assert parsed == expected_validation_data[i], (
-            f"Validation line {i} mismatch: {parsed} != {expected_validation_data[i]}"
-        )
+    # Verify that nested JSON values are properly deserialized in template rendering
+    first_output = json.loads(written_dataframe.iloc[0]["formatted_output"])
+    assert first_output["text"] == "hello"
+    assert first_output["value"] == "{'nested': 'value1'}"
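The final assertion pins down a subtlety: deserialize_json_values (presumably parsing JSON-encoded string values into Python objects; its implementation is not part of this diff) turns '{"nested": "value1"}' into a dict before rendering, and Jinja2 stringifies a dict with Python repr. A sketch of that behavior under those assumptions, with a hypothetical stand-in for the helper:

    import json
    from jinja2 import Template

    record = {"col1": "hello", "col2": '{"nested": "value1"}'}
    # Stand-in for deserialize_json_values: parse values that look like JSON objects.
    deserialized = {k: json.loads(v) if v.startswith("{") else v for k, v in record.items()}
    rendered = Template('{"text": "{{ col1 }}", "value": "{{ col2 }}"}').render(**deserialized)
    print(rendered)  # {"text": "hello", "value": "{'nested': 'value1'}"}
    assert json.loads(rendered)["value"] == "{'nested': 'value1'}"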

tests/engine/test_configurable_task.py
Lines changed: 3 additions & 0 deletions

@@ -68,6 +68,7 @@ def _initialize(self) -> None:
     mock_artifact_storage.final_dataset_folder_name = "final_dataset"
     mock_artifact_storage.partial_results_folder_name = "partial_results"
     mock_artifact_storage.dropped_columns_folder_name = "dropped_columns"
+    mock_artifact_storage.processors_outputs_folder_name = "processors_outputs"
     resource_provider = ResourceProvider(artifact_storage=mock_artifact_storage)

     task = TestTask(config=config, resource_provider=resource_provider)
@@ -99,6 +100,7 @@ def _validate(self) -> None:
     mock_artifact_storage.final_dataset_folder_name = "final_dataset"
     mock_artifact_storage.partial_results_folder_name = "partial_results"
     mock_artifact_storage.dropped_columns_folder_name = "dropped_columns"
+    mock_artifact_storage.processors_outputs_folder_name = "processors_outputs"
     resource_provider = ResourceProvider(artifact_storage=mock_artifact_storage)

     task = TestTask(config=config, resource_provider=resource_provider)
@@ -137,6 +139,7 @@ def _initialize(self) -> None:
     mock_artifact_storage.final_dataset_folder_name = "final_dataset"
     mock_artifact_storage.partial_results_folder_name = "partial_results"
     mock_artifact_storage.dropped_columns_folder_name = "dropped_columns"
+    mock_artifact_storage.processors_outputs_folder_name = "processors_outputs"
     mock_model_registry = Mock(spec=ModelRegistry)
     resource_provider = ResourceProvider(artifact_storage=mock_artifact_storage, model_registry=mock_model_registry)
     task = TestTask(config=config, resource_provider=resource_provider)
