# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import json
from pathlib import Path
from unittest.mock import Mock, patch

import pandas as pd
import pytest

from data_designer.config.dataset_builders import BuildStage
from data_designer.config.processors import ToJsonlProcessorConfig
from data_designer.engine.processing.processors.to_jsonl import ToJsonlProcessor


@pytest.fixture
def stub_processor_config() -> ToJsonlProcessorConfig:
    return ToJsonlProcessorConfig(
        build_stage=BuildStage.POST_BATCH,
        template={"text": "{{ col1 }}", "value": "{{ col2 }}"},
        folder_name="jsonl_output",
        fraction_per_file={"train.jsonl": 0.75, "validation.jsonl": 0.25},
    )


@pytest.fixture
def stub_processor(stub_processor_config: ToJsonlProcessorConfig) -> ToJsonlProcessor:
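    # These tests only exercise resource_provider.artifact_storage, so bare Mocks are enough.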
    mock_resource_provider = Mock()
    mock_artifact_storage = Mock()
    mock_artifact_storage.move_to_outputs = Mock()
    mock_resource_provider.artifact_storage = mock_artifact_storage

    return ToJsonlProcessor(
        config=stub_processor_config,
        resource_provider=mock_resource_provider,
    )


@pytest.fixture
def stub_simple_dataframe() -> pd.DataFrame:
    return pd.DataFrame(
        {
            "col1": ["hello", "world", "test", "data"],
            "col2": [1, 2, 3, 4],
        }
    )


def test_metadata() -> None:
    metadata = ToJsonlProcessor.metadata()

    assert metadata.name == "to_jsonl"
    assert metadata.description == "Save formatted dataset as JSONL files."
    assert metadata.required_resources is None


def test_template_as_string(stub_processor: ToJsonlProcessor) -> None:
    template_str = stub_processor.template_as_string
    assert isinstance(template_str, str)
    assert json.loads(template_str) == {"text": "{{ col1 }}", "value": "{{ col2 }}"}


def test_get_stop_index_per_file(stub_processor: ToJsonlProcessor) -> None:
    stub_processor.config.fraction_per_file = {"train.jsonl": 0.8, "val.jsonl": 0.15, "test.jsonl": 0.05}
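    # With 100 rows, cumulative fractions give stop indices 80 (0.8), 95 (0.8 + 0.15), and 100 (the remainder).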
    result = stub_processor._get_stop_index_per_file(100)

    assert result == {"train.jsonl": 80, "val.jsonl": 95, "test.jsonl": 100}


def test_process_returns_original_dataframe(
    stub_processor: ToJsonlProcessor, stub_simple_dataframe: pd.DataFrame
) -> None:
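    # Writing JSONL files is a side effect; process() should hand the input dataframe back unchanged.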
    result = stub_processor.process(stub_simple_dataframe)
    pd.testing.assert_frame_equal(result, stub_simple_dataframe)


def test_process_writes_correct_content_to_files(
    stub_processor: ToJsonlProcessor, stub_simple_dataframe: pd.DataFrame
) -> None:
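    # With the 0.75/0.25 split, three of the four rows go to train.jsonl and one to validation.jsonl.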
    stub_processor.config.fraction_per_file = {"train.jsonl": 0.75, "validation.jsonl": 0.25}

    # Capture the content of the files that are written to the outputs folder
    file_contents: dict[str, str] = {}

    def capture_file_content(file_path: Path, folder_name: str) -> None:
        with open(file_path, "r") as f:
            file_contents[file_path.name] = f.read()

    stub_processor.artifact_storage.move_to_outputs.side_effect = capture_file_content

    # Process the dataframe and write the files to the outputs folder
    with patch("data_designer.engine.processing.processors.to_jsonl.logger"):
        stub_processor.process(stub_simple_dataframe)

    # Check that the files were moved with the correct names
    assert stub_processor.artifact_storage.move_to_outputs.call_count == 2

    assert "train.jsonl" in file_contents
    assert "validation.jsonl" in file_contents

    # Check that the files contain the correct content
    train_lines = file_contents["train.jsonl"].strip().splitlines()
    validation_lines = file_contents["validation.jsonl"].strip().splitlines()

    assert len(train_lines) == 3, f"Expected 3 lines in train.jsonl, got {len(train_lines)}"
    assert len(validation_lines) == 1, f"Expected 1 line in validation.jsonl, got {len(validation_lines)}"

    # The template renders each field as text, so col2's integers come back as strings.
    expected_train_data = [
        {"text": "hello", "value": "1"},
        {"text": "world", "value": "2"},
        {"text": "test", "value": "3"},
    ]

    for i, line in enumerate(train_lines):
        parsed = json.loads(line)
        assert parsed == expected_train_data[i], f"Train line {i} mismatch: {parsed} != {expected_train_data[i]}"

    expected_validation_data = [{"text": "data", "value": "4"}]

    for i, line in enumerate(validation_lines):
        parsed = json.loads(line)
        assert parsed == expected_validation_data[i], (
            f"Validation line {i} mismatch: {parsed} != {expected_validation_data[i]}"
        )