|
9 | 9 | DropColumnsProcessorConfig, |
10 | 10 | ProcessorConfig, |
11 | 11 | ProcessorType, |
| 12 | + ToJsonlProcessorConfig, |
12 | 13 | get_processor_config_from_kwargs, |
13 | 14 | ) |
14 | 15 |
|
@@ -46,13 +47,90 @@ def test_drop_columns_processor_config_serialization(): |
46 | 47 | assert config_restored.column_names == config.column_names |
47 | 48 |
|
48 | 49 |
|
| 50 | +def test_to_jsonl_processor_config_creation(): |
| 51 | + config = ToJsonlProcessorConfig( |
| 52 | + build_stage=BuildStage.POST_BATCH, |
| 53 | + template={"text": "{{ col1 }}"}, |
| 54 | + folder_name="jsonl_output", |
| 55 | + fraction_per_file={"train.jsonl": 0.8, "validation.jsonl": 0.2}, |
| 56 | + ) |
| 57 | + |
| 58 | + assert config.build_stage == BuildStage.POST_BATCH |
| 59 | + assert config.template == {"text": "{{ col1 }}"} |
| 60 | + assert config.folder_name == "jsonl_output" |
| 61 | + assert config.fraction_per_file == {"train.jsonl": 0.8, "validation.jsonl": 0.2} |
| 62 | + assert config.processor_type == ProcessorType.TO_JSONL |
| 63 | + assert isinstance(config, ProcessorConfig) |
| 64 | + |
| 65 | + |
| 66 | +def test_to_jsonl_processor_config_validation(): |
| 67 | + # Test unsupported stage raises error |
| 68 | + with pytest.raises(ValidationError, match="Invalid dataset builder stage"): |
| 69 | + ToJsonlProcessorConfig( |
| 70 | + build_stage=BuildStage.PRE_BATCH, |
| 71 | + template={"text": "{{ col1 }}"}, |
| 72 | + folder_name="jsonl_output", |
| 73 | + fraction_per_file={"train.jsonl": 0.8, "validation.jsonl": 0.2}, |
| 74 | + ) |
| 75 | + |
| 76 | + # Test missing required field raises error |
| 77 | + with pytest.raises(ValidationError, match="Field required"): |
| 78 | + ToJsonlProcessorConfig(build_stage=BuildStage.POST_BATCH, template={"text": "{{ col1 }}"}) |
| 79 | + |
| 80 | + # Test invalid fraction per file raises error |
| 81 | + with pytest.raises(ValidationError, match="The fractions must sum to 1."): |
| 82 | + ToJsonlProcessorConfig( |
| 83 | + build_stage=BuildStage.POST_BATCH, |
| 84 | + template={"text": "{{ col1 }}"}, |
| 85 | + folder_name="jsonl_output", |
| 86 | + fraction_per_file={"train.jsonl": 0.8, "validation.jsonl": 0.3}, |
| 87 | + ) |
| 88 | + |
| 89 | + |
| 90 | +def test_to_jsonl_processor_config_serialization(): |
| 91 | + config = ToJsonlProcessorConfig( |
| 92 | + build_stage=BuildStage.POST_BATCH, |
| 93 | + template={"text": "{{ col1 }}"}, |
| 94 | + folder_name="jsonl_output", |
| 95 | + fraction_per_file={"train.jsonl": 0.8, "validation.jsonl": 0.2}, |
| 96 | + ) |
| 97 | + |
| 98 | + # Serialize to dict |
| 99 | + config_dict = config.model_dump() |
| 100 | + assert config_dict["build_stage"] == "post_batch" |
| 101 | + assert config_dict["template"] == {"text": "{{ col1 }}"} |
| 102 | + assert config_dict["folder_name"] == "jsonl_output" |
| 103 | + assert config_dict["fraction_per_file"] == {"train.jsonl": 0.8, "validation.jsonl": 0.2} |
| 104 | + |
| 105 | + # Deserialize from dict |
| 106 | + config_restored = ToJsonlProcessorConfig.model_validate(config_dict) |
| 107 | + assert config_restored.build_stage == config.build_stage |
| 108 | + assert config_restored.template == config.template |
| 109 | + assert config_restored.folder_name == config.folder_name |
| 110 | + assert config_restored.fraction_per_file == config.fraction_per_file |
| 111 | + |
| 112 | + |
49 | 113 | def test_get_processor_config_from_kwargs(): |
50 | 114 | # Test successful creation |
51 | | - config = get_processor_config_from_kwargs( |
| 115 | + config_drop_columns = get_processor_config_from_kwargs( |
52 | 116 | ProcessorType.DROP_COLUMNS, build_stage=BuildStage.POST_BATCH, column_names=["col1"] |
53 | 117 | ) |
54 | | - assert isinstance(config, DropColumnsProcessorConfig) |
55 | | - assert config.column_names == ["col1"] |
| 118 | + assert isinstance(config_drop_columns, DropColumnsProcessorConfig) |
| 119 | + assert config_drop_columns.column_names == ["col1"] |
| 120 | + assert config_drop_columns.processor_type == ProcessorType.DROP_COLUMNS |
| 121 | + |
| 122 | + config_to_jsonl = get_processor_config_from_kwargs( |
| 123 | + ProcessorType.TO_JSONL, |
| 124 | + build_stage=BuildStage.POST_BATCH, |
| 125 | + template={"text": "{{ col1 }}"}, |
| 126 | + folder_name="jsonl_output", |
| 127 | + fraction_per_file={"train.jsonl": 0.8, "validation.jsonl": 0.2}, |
| 128 | + ) |
| 129 | + assert isinstance(config_to_jsonl, ToJsonlProcessorConfig) |
| 130 | + assert config_to_jsonl.template == {"text": "{{ col1 }}"} |
| 131 | + assert config_to_jsonl.folder_name == "jsonl_output" |
| 132 | + assert config_to_jsonl.fraction_per_file == {"train.jsonl": 0.8, "validation.jsonl": 0.2} |
| 133 | + assert config_to_jsonl.processor_type == ProcessorType.TO_JSONL |
56 | 134 |
|
57 | 135 | # Test with unknown processor type returns None |
58 | 136 | from enum import Enum |
|
0 commit comments