 # SPDX-License-Identifier: Apache-2.0
 
 import json
-from pathlib import Path
-from unittest.mock import Mock, patch
+from unittest.mock import Mock
 
 import pandas as pd
 import pytest
 
 from data_designer.config.dataset_builders import BuildStage
-from data_designer.config.processors import JsonlExportProcessorConfig
-from data_designer.engine.processing.processors.jsonl_export import JsonlExportProcessor
+from data_designer.config.processors import OutputFormatProcessorConfig
+from data_designer.engine.dataset_builders.artifact_storage import BatchStage
+from data_designer.engine.processing.processors.output_format import OutputFormatProcessor
 
 
 @pytest.fixture
-def stub_processor_config() -> JsonlExportProcessorConfig:
-    return JsonlExportProcessorConfig(
+def stub_processor_config() -> OutputFormatProcessorConfig:
+    return OutputFormatProcessorConfig(
         build_stage=BuildStage.POST_BATCH,
         template='{"text": "{{ col1 }}", "value": "{{ col2 }}"}',
-        fraction_per_file={"train.jsonl": 0.75, "validation.jsonl": 0.25},
+        name="test_output_format",
     )
 
 
 @pytest.fixture
-def stub_processor(stub_processor_config: JsonlExportProcessorConfig, tmp_path: Path) -> JsonlExportProcessor:
+def stub_processor(stub_processor_config: OutputFormatProcessorConfig) -> OutputFormatProcessor:
     mock_resource_provider = Mock()
     mock_artifact_storage = Mock()
-    mock_artifact_storage.move_processor_output = Mock()
+    mock_artifact_storage.write_batch_to_parquet_file = Mock()
     mock_resource_provider.artifact_storage = mock_artifact_storage
 
-    processor = JsonlExportProcessor(
+    processor = OutputFormatProcessor(
         config=stub_processor_config,
         resource_provider=mock_resource_provider,
     )
@@ -47,80 +47,103 @@ def stub_simple_dataframe() -> pd.DataFrame:
 
 
 def test_metadata() -> None:
-    metadata = JsonlExportProcessor.metadata()
+    metadata = OutputFormatProcessor.metadata()
 
-    assert metadata.name == "jsonl_export"
-    assert metadata.description == "Save formatted dataset as JSONL files."
+    assert metadata.name == "output_format"
+    assert metadata.description == "Format the dataset using a Jinja2 template."
     assert metadata.required_resources is None
 
 
-def test_template_as_string(stub_processor: JsonlExportProcessor) -> None:
-    template_str = stub_processor.config.template
-    assert isinstance(template_str, str)
-    assert template_str == '{"text": "{{ col1 }}", "value": "{{ col2 }}"}'
+def test_process_returns_original_dataframe(
+    stub_processor: OutputFormatProcessor, stub_simple_dataframe: pd.DataFrame
+) -> None:
+    result = stub_processor.process(stub_simple_dataframe, current_batch_number=0)
+    pd.testing.assert_frame_equal(result, stub_simple_dataframe)
+
 
+def test_process_writes_formatted_output_to_parquet(
+    stub_processor: OutputFormatProcessor, stub_simple_dataframe: pd.DataFrame
+) -> None:
+    # Capture the formatted dataframe that is written to parquet
+    written_dataframe: pd.DataFrame | None = None
 
-def test_get_stop_index_per_file(stub_processor: JsonlExportProcessor) -> None:
-    stub_processor.config.fraction_per_file = {"train.jsonl": 0.8, "val.jsonl": 0.15, "test.jsonl": 0.05}
-    result = stub_processor._get_stop_index_per_file(100)
+    def capture_dataframe(batch_number: int, dataframe: pd.DataFrame, batch_stage: BatchStage, subfolder: str) -> None:
+        nonlocal written_dataframe
+        written_dataframe = dataframe
 
-    assert result == {"train.jsonl": 80, "val.jsonl": 95, "test.jsonl": 100}
+    stub_processor.artifact_storage.write_batch_to_parquet_file.side_effect = capture_dataframe
 
+    # Process the dataframe
+    result = stub_processor.process(stub_simple_dataframe, current_batch_number=0)
 
-def test_process_returns_original_dataframe(
-    stub_processor: JsonlExportProcessor, stub_simple_dataframe: pd.DataFrame
-) -> None:
-    result = stub_processor.process(stub_simple_dataframe)
+    # Verify the original dataframe is returned
     pd.testing.assert_frame_equal(result, stub_simple_dataframe)
 
+    # Verify write_batch_to_parquet_file was called with correct parameters
+    stub_processor.artifact_storage.write_batch_to_parquet_file.assert_called_once()
+    call_args = stub_processor.artifact_storage.write_batch_to_parquet_file.call_args
+
+    assert call_args.kwargs["batch_number"] == 0
+    assert call_args.kwargs["batch_stage"] == BatchStage.PROCESSORS_OUTPUTS
+    assert call_args.kwargs["subfolder"] == "test_output_format"
+
+    # Verify the formatted dataframe has the correct structure
+    assert written_dataframe is not None
+    assert list(written_dataframe.columns) == ["formatted_output"]
+    assert len(written_dataframe) == 4
+
+    # Verify the formatted content
+    expected_formatted_output = [
+        '{"text": "hello", "value": "1"}',
+        '{"text": "world", "value": "2"}',
+        '{"text": "test", "value": "3"}',
+        '{"text": "data", "value": "4"}',
+    ]
 
-def test_process_writes_correct_content_to_files(
-    stub_processor: JsonlExportProcessor, stub_simple_dataframe: pd.DataFrame
-) -> None:
-    stub_processor.config.fraction_per_file = {"train.jsonl": 0.75, "validation.jsonl": 0.25}
-
-    # Capture the content of the files that are written to the outputs folder
-    file_contents: dict[str, str] = {}
+    for i, expected in enumerate(expected_formatted_output):
+        actual = written_dataframe.iloc[i]["formatted_output"]
+        # Parse both as JSON to compare structure (ignoring whitespace differences)
+        assert json.loads(actual) == json.loads(expected), f"Row {i} mismatch: {actual} != {expected}"
 
-    def capture_file_content(from_path: Path, folder_name: str) -> None:
-        with open(from_path, "r") as f:
-            file_contents[from_path.name] = f.read()
 
-    stub_processor.artifact_storage.move_processor_output.side_effect = capture_file_content
+def test_process_without_batch_number_does_not_write(
+    stub_processor: OutputFormatProcessor, stub_simple_dataframe: pd.DataFrame
+) -> None:
+    # Process without batch number (preview mode)
+    result = stub_processor.process(stub_simple_dataframe, current_batch_number=None)
 
-    # Process the dataframe and write the files to the outputs folder
-    with patch("data_designer.engine.processing.processors.jsonl_export.logger"):
-        stub_processor.process(stub_simple_dataframe)
+    # Verify the original dataframe is returned
+    pd.testing.assert_frame_equal(result, stub_simple_dataframe)
 
-    # Check that the files were moved with the correct names
-    assert stub_processor.artifact_storage.move_processor_output.call_count == 2
+    # Verify write_batch_to_parquet_file was NOT called
+    stub_processor.artifact_storage.write_batch_to_parquet_file.assert_not_called()
 
-    assert "train.jsonl" in file_contents
-    assert "validation.jsonl" in file_contents
 
-    # Check that the files contain the correct content
-    train_lines = file_contents["train.jsonl"].strip().split("\n") if file_contents["train.jsonl"].strip() else []
-    validation_lines = (
-        file_contents["validation.jsonl"].strip().split("\n") if file_contents["validation.jsonl"].strip() else []
+def test_process_with_json_serialized_values(stub_processor: OutputFormatProcessor) -> None:
+    # Test with JSON-serialized values in dataframe
+    df_with_json = pd.DataFrame(
+        {
+            "col1": ["hello", "world"],
+            "col2": ['{"nested": "value1"}', '{"nested": "value2"}'],
+        }
     )
 
-    assert len(train_lines) == 3, f"Expected 3 lines in train.jsonl, got {len(train_lines)}"
-    assert len(validation_lines) == 1, f"Expected 1 line in validation.jsonl, got {len(validation_lines)}"
+    written_dataframe: pd.DataFrame | None = None
 
-    expected_train_data = [
-        {"text": "hello", "value": "1"},
-        {"text": "world", "value": "2"},
-        {"text": "test", "value": "3"},
-    ]
+    def capture_dataframe(batch_number: int, dataframe: pd.DataFrame, batch_stage: BatchStage, subfolder: str) -> None:
+        nonlocal written_dataframe
+        written_dataframe = dataframe
+
+    stub_processor.artifact_storage.write_batch_to_parquet_file.side_effect = capture_dataframe
 
-    for i, line in enumerate(train_lines):
-        parsed = json.loads(line)
-        assert parsed == expected_train_data[i], f"Train line {i} mismatch: {parsed} != {expected_train_data[i]}"
+    # Process the dataframe
+    stub_processor.process(df_with_json, current_batch_number=0)
 
-    expected_validation_data = [{"text": "data", "value": "4"}]
+    # Verify the formatted dataframe was written
+    assert written_dataframe is not None
+    assert len(written_dataframe) == 2
 
-    for i, line in enumerate(validation_lines):
-        parsed = json.loads(line)
-        assert parsed == expected_validation_data[i], (
-            f"Validation line {i} mismatch: {parsed} != {expected_validation_data[i]}"
-        )
+    # Verify that nested JSON values are properly deserialized in template rendering
+    first_output = json.loads(written_dataframe.iloc[0]["formatted_output"])
+    assert first_output["text"] == "hello"
+    assert first_output["value"] == "{'nested': 'value1'}"
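
For context, a minimal sketch of the process() behavior these tests pin down. This is an assumption inferred from the mocks and assertions above, not the actual OutputFormatProcessor implementation: the class and helper names (OutputFormatProcessorSketch, _maybe_json_loads) are hypothetical, and the jinja2 rendering and write_batch_to_parquet_file signature are guesses from the test fixtures.

import json

import pandas as pd
from jinja2 import Template

from data_designer.engine.dataset_builders.artifact_storage import BatchStage


def _maybe_json_loads(value):
    # Deserialize JSON-encoded strings so nested structures reach the
    # template as Python objects; leave everything else untouched.
    if isinstance(value, str):
        try:
            return json.loads(value)
        except ValueError:
            return value
    return value


class OutputFormatProcessorSketch:  # hypothetical name, for illustration only
    def __init__(self, config, resource_provider):
        self.config = config
        self.artifact_storage = resource_provider.artifact_storage

    def process(self, dataframe: pd.DataFrame, current_batch_number: int | None = None) -> pd.DataFrame:
        template = Template(self.config.template)
        rendered = [
            template.render(**{key: _maybe_json_loads(value) for key, value in record.items()})
            for record in dataframe.to_dict(orient="records")
        ]
        formatted = pd.DataFrame({"formatted_output": rendered})
        # Preview runs pass current_batch_number=None and skip the write.
        if current_batch_number is not None:
            self.artifact_storage.write_batch_to_parquet_file(
                batch_number=current_batch_number,
                dataframe=formatted,
                batch_stage=BatchStage.PROCESSORS_OUTPUTS,
                subfolder=self.config.name,
            )
        # The original dataframe is returned unchanged.
        return dataframe

Note that a deserialized dict renders via its Python repr (single quotes), which is exactly what test_process_with_json_serialized_values asserts for first_output["value"].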