Skip to content

Commit 37fffe4

Browse files
committed
tests
1 parent 4513747 commit 37fffe4

File tree

3 files changed

+210
-3
lines changed

3 files changed

+210
-3
lines changed

examples/example.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,3 +207,5 @@
207207
artifact_path="./artifacts", blob_storage_path="/Users/amanoel/Data/nemotron-personas-datasets_v0.0.6"
208208
)
209209
preview = dd.preview(config_builder, num_records=10)
210+
211+
dd.create(config_builder, num_records=20)

tests/config/test_processors.py

Lines changed: 81 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
DropColumnsProcessorConfig,
1010
ProcessorConfig,
1111
ProcessorType,
12+
ToJsonlProcessorConfig,
1213
get_processor_config_from_kwargs,
1314
)
1415

@@ -46,13 +47,90 @@ def test_drop_columns_processor_config_serialization():
4647
assert config_restored.column_names == config.column_names
4748

4849

50+
def test_to_jsonl_processor_config_creation():
    """A ToJsonlProcessorConfig built with valid arguments stores every field as given."""
    splits = {"train.jsonl": 0.8, "validation.jsonl": 0.2}
    cfg = ToJsonlProcessorConfig(
        build_stage=BuildStage.POST_BATCH,
        template={"text": "{{ col1 }}"},
        folder_name="jsonl_output",
        fraction_per_file=splits,
    )

    # The config is a ProcessorConfig subclass tagged with the TO_JSONL type.
    assert isinstance(cfg, ProcessorConfig)
    assert cfg.processor_type == ProcessorType.TO_JSONL
    # Every constructor argument is kept verbatim.
    assert cfg.build_stage == BuildStage.POST_BATCH
    assert cfg.template == {"text": "{{ col1 }}"}
    assert cfg.folder_name == "jsonl_output"
    assert cfg.fraction_per_file == splits
64+
65+
66+
def test_to_jsonl_processor_config_validation():
    """Each validator on ToJsonlProcessorConfig rejects its invalid input."""
    # An unsupported build stage is rejected.
    with pytest.raises(ValidationError, match="Invalid dataset builder stage"):
        ToJsonlProcessorConfig(
            build_stage=BuildStage.PRE_BATCH,
            template={"text": "{{ col1 }}"},
            folder_name="jsonl_output",
            fraction_per_file={"train.jsonl": 0.8, "validation.jsonl": 0.2},
        )

    # Omitting required fields (folder_name, fraction_per_file) is rejected.
    with pytest.raises(ValidationError, match="Field required"):
        ToJsonlProcessorConfig(build_stage=BuildStage.POST_BATCH, template={"text": "{{ col1 }}"})

    # Fractions that do not add up to one are rejected.
    with pytest.raises(ValidationError, match="The fractions must sum to 1."):
        ToJsonlProcessorConfig(
            build_stage=BuildStage.POST_BATCH,
            template={"text": "{{ col1 }}"},
            folder_name="jsonl_output",
            fraction_per_file={"train.jsonl": 0.8, "validation.jsonl": 0.3},
        )
88+
89+
90+
def test_to_jsonl_processor_config_serialization():
    """A config survives a round trip through model_dump / model_validate."""
    original = ToJsonlProcessorConfig(
        build_stage=BuildStage.POST_BATCH,
        template={"text": "{{ col1 }}"},
        folder_name="jsonl_output",
        fraction_per_file={"train.jsonl": 0.8, "validation.jsonl": 0.2},
    )

    # Serialize: the enum becomes its string value, the rest pass through unchanged.
    dumped = original.model_dump()
    expected_fields = {
        "build_stage": "post_batch",
        "template": {"text": "{{ col1 }}"},
        "folder_name": "jsonl_output",
        "fraction_per_file": {"train.jsonl": 0.8, "validation.jsonl": 0.2},
    }
    for field_name, expected_value in expected_fields.items():
        assert dumped[field_name] == expected_value

    # Deserialize: the restored config matches the original field-for-field.
    restored = ToJsonlProcessorConfig.model_validate(dumped)
    for field_name in ("build_stage", "template", "folder_name", "fraction_per_file"):
        assert getattr(restored, field_name) == getattr(original, field_name)
111+
112+
49113
def test_get_processor_config_from_kwargs():
50114
# Test successful creation
51-
config = get_processor_config_from_kwargs(
115+
config_drop_columns = get_processor_config_from_kwargs(
52116
ProcessorType.DROP_COLUMNS, build_stage=BuildStage.POST_BATCH, column_names=["col1"]
53117
)
54-
assert isinstance(config, DropColumnsProcessorConfig)
55-
assert config.column_names == ["col1"]
118+
assert isinstance(config_drop_columns, DropColumnsProcessorConfig)
119+
assert config_drop_columns.column_names == ["col1"]
120+
assert config_drop_columns.processor_type == ProcessorType.DROP_COLUMNS
121+
122+
config_to_jsonl = get_processor_config_from_kwargs(
123+
ProcessorType.TO_JSONL,
124+
build_stage=BuildStage.POST_BATCH,
125+
template={"text": "{{ col1 }}"},
126+
folder_name="jsonl_output",
127+
fraction_per_file={"train.jsonl": 0.8, "validation.jsonl": 0.2},
128+
)
129+
assert isinstance(config_to_jsonl, ToJsonlProcessorConfig)
130+
assert config_to_jsonl.template == {"text": "{{ col1 }}"}
131+
assert config_to_jsonl.folder_name == "jsonl_output"
132+
assert config_to_jsonl.fraction_per_file == {"train.jsonl": 0.8, "validation.jsonl": 0.2}
133+
assert config_to_jsonl.processor_type == ProcessorType.TO_JSONL
56134

57135
# Test with unknown processor type returns None
58136
from enum import Enum
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
import json
5+
from pathlib import Path
6+
from unittest.mock import Mock, patch
7+
8+
import pandas as pd
9+
import pytest
10+
11+
from data_designer.config.dataset_builders import BuildStage
12+
from data_designer.config.processors import ToJsonlProcessorConfig
13+
from data_designer.engine.processing.processors.to_jsonl import ToJsonlProcessor
14+
15+
16+
@pytest.fixture
def stub_processor_config() -> ToJsonlProcessorConfig:
    """A minimal post-batch config with a 75/25 train/validation split."""
    config_kwargs = {
        "build_stage": BuildStage.POST_BATCH,
        "template": {"text": "{{ col1 }}", "value": "{{ col2 }}"},
        "folder_name": "jsonl_output",
        "fraction_per_file": {"train.jsonl": 0.75, "validation.jsonl": 0.25},
    }
    return ToJsonlProcessorConfig(**config_kwargs)
24+
25+
26+
@pytest.fixture
def stub_processor(stub_processor_config: ToJsonlProcessorConfig, tmp_path: Path) -> ToJsonlProcessor:
    """A ToJsonlProcessor wired to a fully mocked resource provider / artifact storage."""
    provider = Mock()
    storage = Mock()
    storage.move_to_outputs = Mock()
    provider.artifact_storage = storage

    return ToJsonlProcessor(
        config=stub_processor_config,
        resource_provider=provider,
    )
38+
39+
40+
@pytest.fixture
def stub_simple_dataframe() -> pd.DataFrame:
    """Four rows: a string column (col1) and a matching int column (col2)."""
    records = {
        "col1": ["hello", "world", "test", "data"],
        "col2": [1, 2, 3, 4],
    }
    return pd.DataFrame(records)
48+
49+
50+
def test_metadata() -> None:
    """The processor advertises the expected registry metadata."""
    meta = ToJsonlProcessor.metadata()

    assert meta.name == "to_jsonl"
    assert meta.description == "Save formatted dataset as JSONL files."
    # No extra resources are required beyond the processor itself.
    assert meta.required_resources is None
56+
57+
58+
def test_template_as_string(stub_processor: ToJsonlProcessor) -> None:
    """template_as_string renders the template mapping as a JSON string."""
    rendered = stub_processor.template_as_string

    assert isinstance(rendered, str)
    # Round-tripping through json.loads recovers the fixture's template dict.
    assert json.loads(rendered) == {"text": "{{ col1 }}", "value": "{{ col2 }}"}
62+
63+
64+
def test_get_stop_index_per_file(stub_processor: ToJsonlProcessor) -> None:
    """Per-file fractions are turned into cumulative stop indices over the row count."""
    stub_processor.config.fraction_per_file = {"train.jsonl": 0.8, "val.jsonl": 0.15, "test.jsonl": 0.05}

    stops = stub_processor._get_stop_index_per_file(100)

    # 80% -> rows [0, 80); next 15% -> [80, 95); final 5% -> [95, 100).
    assert stops == {"train.jsonl": 80, "val.jsonl": 95, "test.jsonl": 100}
69+
70+
71+
def test_process_returns_original_dataframe(
    stub_processor: ToJsonlProcessor, stub_simple_dataframe: pd.DataFrame
) -> None:
    """process() passes the input dataframe back to the caller unchanged."""
    returned = stub_processor.process(stub_simple_dataframe)

    pd.testing.assert_frame_equal(returned, stub_simple_dataframe)
76+
77+
78+
def test_process_writes_correct_content_to_files(
    stub_processor: ToJsonlProcessor, stub_simple_dataframe: pd.DataFrame
) -> None:
    """With a 75/25 split, 4 rows land 3-to-1 across train/validation, rendered per template."""
    stub_processor.config.fraction_per_file = {"train.jsonl": 0.75, "validation.jsonl": 0.25}

    # Intercept move_to_outputs so we can read each file's content before it is moved away.
    file_contents: dict[str, str] = {}

    def capture_file_content(file_path: Path, folder_name: str) -> None:
        file_contents[file_path.name] = file_path.read_text()

    stub_processor.artifact_storage.move_to_outputs.side_effect = capture_file_content

    # Run the processor with its logger silenced.
    with patch("data_designer.engine.processing.processors.to_jsonl.logger"):
        stub_processor.process(stub_simple_dataframe)

    # One output file per configured split, each moved exactly once.
    assert stub_processor.artifact_storage.move_to_outputs.call_count == 2
    assert "train.jsonl" in file_contents
    assert "validation.jsonl" in file_contents

    def split_lines(raw: str) -> list[str]:
        # An empty/whitespace-only file yields no lines rather than [""].
        stripped = raw.strip()
        return stripped.split("\n") if stripped else []

    train_lines = split_lines(file_contents["train.jsonl"])
    validation_lines = split_lines(file_contents["validation.jsonl"])

    assert len(train_lines) == 3, f"Expected 3 lines in train.jsonl, got {len(train_lines)}"
    assert len(validation_lines) == 1, f"Expected 1 line in validation.jsonl, got {len(validation_lines)}"

    # The first three rows go to train.jsonl; note the template renders values as strings.
    expected_train_data = [
        {"text": "hello", "value": "1"},
        {"text": "world", "value": "2"},
        {"text": "test", "value": "3"},
    ]
    for i, line in enumerate(train_lines):
        parsed = json.loads(line)
        assert parsed == expected_train_data[i], f"Train line {i} mismatch: {parsed} != {expected_train_data[i]}"

    # The remaining row goes to validation.jsonl.
    expected_validation_data = [{"text": "data", "value": "4"}]
    for i, line in enumerate(validation_lines):
        parsed = json.loads(line)
        assert parsed == expected_validation_data[i], (
            f"Validation line {i} mismatch: {parsed} != {expected_validation_data[i]}"
        )

0 commit comments

Comments
 (0)