Skip to content

Commit c30bd73

Browse files
committed
tests
1 parent 4513747 commit c30bd73

File tree

2 files changed

+83
-3
lines changed

2 files changed

+83
-3
lines changed

examples/example.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,3 +207,5 @@
207207
artifact_path="./artifacts", blob_storage_path="/Users/amanoel/Data/nemotron-personas-datasets_v0.0.6"
208208
)
209209
preview = dd.preview(config_builder, num_records=10)
210+
211+
dd.create(config_builder, num_records=20)

tests/config/test_processors.py

Lines changed: 81 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
DropColumnsProcessorConfig,
1010
ProcessorConfig,
1111
ProcessorType,
12+
ToJsonlProcessorConfig,
1213
get_processor_config_from_kwargs,
1314
)
1415

@@ -46,13 +47,90 @@ def test_drop_columns_processor_config_serialization():
4647
assert config_restored.column_names == config.column_names
4748

4849

50+
def test_to_jsonl_processor_config_creation():
    """Check that a fully specified ToJsonlProcessorConfig keeps its inputs.

    Builds the config with every field set and verifies that each attribute
    reads back with the value that was passed in, that the processor type is
    ``ProcessorType.TO_JSONL``, and that the instance is a ``ProcessorConfig``.
    """
    jsonl_config = ToJsonlProcessorConfig(
        build_stage=BuildStage.POST_BATCH,
        template={"text": "{{ col1 }}"},
        folder_name="jsonl_output",
        fraction_per_file={"train.jsonl": 0.8, "validation.jsonl": 0.2},
    )

    # Attribute name -> value the constructed config is expected to expose.
    expected_attributes = (
        ("build_stage", BuildStage.POST_BATCH),
        ("template", {"text": "{{ col1 }}"}),
        ("folder_name", "jsonl_output"),
        ("fraction_per_file", {"train.jsonl": 0.8, "validation.jsonl": 0.2}),
        ("processor_type", ProcessorType.TO_JSONL),
    )
    for attribute_name, expected_value in expected_attributes:
        assert getattr(jsonl_config, attribute_name) == expected_value

    assert isinstance(jsonl_config, ProcessorConfig)
64+
65+
66+
def test_to_jsonl_processor_config_validation():
    """Check that invalid ToJsonlProcessorConfig inputs raise ValidationError.

    Three independent failure cases are exercised: an unsupported build
    stage, missing required fields, and per-file fractions whose total is
    not 1.
    """

    def valid_kwargs():
        # Fresh copy of a known-good argument set for every case, so no
        # dict object is shared between constructor calls.
        return {
            "build_stage": BuildStage.POST_BATCH,
            "template": {"text": "{{ col1 }}"},
            "folder_name": "jsonl_output",
            "fraction_per_file": {"train.jsonl": 0.8, "validation.jsonl": 0.2},
        }

    # Test unsupported stage raises error
    pre_batch_kwargs = valid_kwargs()
    pre_batch_kwargs["build_stage"] = BuildStage.PRE_BATCH
    with pytest.raises(ValidationError, match="Invalid dataset builder stage"):
        ToJsonlProcessorConfig(**pre_batch_kwargs)

    # Test missing required field raises error
    incomplete_kwargs = valid_kwargs()
    del incomplete_kwargs["folder_name"]
    del incomplete_kwargs["fraction_per_file"]
    with pytest.raises(ValidationError, match="Field required"):
        ToJsonlProcessorConfig(**incomplete_kwargs)

    # Test invalid fraction per file raises error
    # NOTE(review): `match` is treated as a regular expression, so the
    # trailing "." matches any character, not only a literal period.
    bad_fraction_kwargs = valid_kwargs()
    bad_fraction_kwargs["fraction_per_file"] = {"train.jsonl": 0.8, "validation.jsonl": 0.3}
    with pytest.raises(ValidationError, match="The fractions must sum to 1."):
        ToJsonlProcessorConfig(**bad_fraction_kwargs)
88+
89+
90+
def test_to_jsonl_processor_config_serialization():
    """Check that ToJsonlProcessorConfig survives a dump/validate round trip.

    The config is dumped with ``model_dump()``, the dumped values are
    compared against their expected plain-Python form, and the dict is then
    fed back through ``model_validate()`` to confirm the restored config
    matches the original field by field.
    """
    source_config = ToJsonlProcessorConfig(
        build_stage=BuildStage.POST_BATCH,
        template={"text": "{{ col1 }}"},
        folder_name="jsonl_output",
        fraction_per_file={"train.jsonl": 0.8, "validation.jsonl": 0.2},
    )

    # Serialize to dict
    dumped = source_config.model_dump()
    expected_dump = {
        "build_stage": "post_batch",
        "template": {"text": "{{ col1 }}"},
        "folder_name": "jsonl_output",
        "fraction_per_file": {"train.jsonl": 0.8, "validation.jsonl": 0.2},
    }
    for key, expected_value in expected_dump.items():
        assert dumped[key] == expected_value

    # Deserialize from dict
    restored_config = ToJsonlProcessorConfig.model_validate(dumped)
    for field_name in ("build_stage", "template", "folder_name", "fraction_per_file"):
        assert getattr(restored_config, field_name) == getattr(source_config, field_name)
111+
112+
49113
def test_get_processor_config_from_kwargs():
50114
# Test successful creation
51-
config = get_processor_config_from_kwargs(
115+
config_drop_columns = get_processor_config_from_kwargs(
52116
ProcessorType.DROP_COLUMNS, build_stage=BuildStage.POST_BATCH, column_names=["col1"]
53117
)
54-
assert isinstance(config, DropColumnsProcessorConfig)
55-
assert config.column_names == ["col1"]
118+
assert isinstance(config_drop_columns, DropColumnsProcessorConfig)
119+
assert config_drop_columns.column_names == ["col1"]
120+
assert config_drop_columns.processor_type == ProcessorType.DROP_COLUMNS
121+
122+
config_to_jsonl = get_processor_config_from_kwargs(
123+
ProcessorType.TO_JSONL,
124+
build_stage=BuildStage.POST_BATCH,
125+
template={"text": "{{ col1 }}"},
126+
folder_name="jsonl_output",
127+
fraction_per_file={"train.jsonl": 0.8, "validation.jsonl": 0.2},
128+
)
129+
assert isinstance(config_to_jsonl, ToJsonlProcessorConfig)
130+
assert config_to_jsonl.template == {"text": "{{ col1 }}"}
131+
assert config_to_jsonl.folder_name == "jsonl_output"
132+
assert config_to_jsonl.fraction_per_file == {"train.jsonl": 0.8, "validation.jsonl": 0.2}
133+
assert config_to_jsonl.processor_type == ProcessorType.TO_JSONL
56134

57135
# Test with unknown processor type returns None
58136
from enum import Enum

0 commit comments

Comments
 (0)