update data generation tests

RobotSail · RobotSail · commit 143f41e5e233 · 2025-02-23T08:25:08.000Z
Signed-off-by: Oleg Silkin <97077423+RobotSail@users.noreply.github.com>
diff --git a/.pylintrc b/.pylintrc
@@ -448,7 +448,9 @@ disable=raw-checker-failed,
         abstract-method,
         wrong-import-order,
         line-too-long,
-        logging-fstring-interpolation
+        logging-fstring-interpolation,
+        # duplicate-code is being set off by our deprecation warnings
+        duplicate-code
 
 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option
diff --git a/src/instructlab/sdg/datamixing.py b/src/instructlab/sdg/datamixing.py
@@ -192,9 +192,9 @@ def _create_mixed_dataset(self, num_proc):
         )
 
         # assert that the dataset only has the allowed columns
-        assert set(mixed_ds.column_names) == set(ALLOWED_COLS), (
-            "Dataset has invalid columns"
-        )
+        assert set(mixed_ds.column_names) == set(
+            ALLOWED_COLS
+        ), "Dataset has invalid columns"
         return mixed_ds
 
     def add_dataset(self, path, sampling_size):
diff --git a/src/instructlab/sdg/mix_data.py b/src/instructlab/sdg/mix_data.py
diff --git a/tests/test_datamixing.py b/tests/test_datamixing.py
@@ -17,10 +17,10 @@
     DataMixer,
     Recipe,
     _add_extra_contexts_to_samples,
+    _conv_pretrain,
+    _create_auxiliary_dataset,
     _create_phase07_ds,
     _create_phase10_ds,
-    _create_auxiliary_dataset,
-    _conv_pretrain,
 )
 
 # We mock out the actual things that use num_procs anyway, but just
@@ -269,17 +269,17 @@ def test_phase07_creation(mock_auxiliary_dataset):
 
     # Check if Phase 0.7 contains knowledge and auxiliary datasets
     expected_phase07_size = len(knowledge_dataset) + len(auxiliary_dataset)
-    assert len(phase07_ds) == expected_phase07_size, (
-        "Phase 0.7 should contain knowledge and auxiliary datasets."
-    )
+    assert (
+        len(phase07_ds) == expected_phase07_size
+    ), "Phase 0.7 should contain knowledge and auxiliary datasets."
 
     # Verify that the content from all datasets is present in Phase 0.7
     auxiliary_ids = {item["id"] for item in auxiliary_dataset}
     phase07_ids = {item["id"] for item in phase07_ds}
 
-    assert auxiliary_ids.issubset(phase07_ids), (
-        "Phase 0.7 should include all auxiliary dataset entries."
-    )
+    assert auxiliary_ids.issubset(
+        phase07_ids
+    ), "Phase 0.7 should include all auxiliary dataset entries."
 
 
 @patch("instructlab.sdg.datamixing._create_auxiliary_dataset")
@@ -307,9 +307,9 @@ def test_phase10_creation(mock_auxiliary_dataset):
     )
 
     # Check if Phase 1.0 includes knowledge, auxiliary, and knowledge_skills content
-    assert len(phase10_ds) == phase10_expected_size, (
-        "Phase 1.0 should contain the expected number of entries, including Phase 0.7 content."
-    )
+    assert (
+        len(phase10_ds) == phase10_expected_size
+    ), "Phase 1.0 should contain the expected number of entries, including Phase 0.7 content."
 
 
 def test_all_samples_have_unmask_field():
@@ -375,11 +375,11 @@ def test_phase07_knowledge_samples_have_unmask_true():
             lambda rec: _conv_pretrain(rec, use_legacy_pretraining_format=False)
         )
         for sample in auxiliary_ds:
-            assert sample["unmask"] is True, (
-                "Auxiliary sample does not have unmask=True"
-            )
+            assert (
+                sample["unmask"] is True
+            ), "Auxiliary sample does not have unmask=True"
 
     # verify that at least ONE sample in phase10 has unmask=True
-    assert any(sample["unmask"] for sample in phase10_ds), (
-        "No samples in phase10 have unmask=True"
-    )
+    assert any(
+        sample["unmask"] for sample in phase10_ds
+    ), "No samples in phase10 have unmask=True"
diff --git a/tests/test_generate_data.py b/tests/test_generate_data.py
@@ -93,20 +93,21 @@ def validate_messages_dataset(dataset_file_name, expected_samples):
 
 def validate_skill_leaf_node_dataset(dataset_file_name):
     ds = load_dataset("json", data_files=dataset_file_name, split="train")
-    assert len(ds.features) == 9
+    assert len(ds.features) == 10
     features = [
-        "task_description",
-        "seed_context",
-        "seed_question",
-        "seed_response",
-        "output",
-        "id",
-        "leaf_node_path",
-        "leaf_node_type",
+        ("task_description", "string"),
+        ("seed_context", "string"),
+        ("seed_question", "string"),
+        ("seed_response", "string"),
+        ("output", "string"),
+        ("id", "string"),
+        ("leaf_node_path", "string"),
+        ("leaf_node_type", "string"),
+        ("unmask", "bool"),
     ]
-    for feature in features:
+    for feature, dtype in features:
         assert feature in ds.features
-        assert ds.features[feature].dtype == "string"
+        assert ds.features[feature].dtype == dtype
     assert "messages" in ds.features
     assert len(ds.features["messages"]) == 1
     assert len(ds.features["messages"][0]) == 2
@@ -116,11 +117,11 @@ def validate_skill_leaf_node_dataset(dataset_file_name):
 
 def validate_phase_leaf_node_dataset(dataset_file_name):
     ds = load_dataset("json", data_files=dataset_file_name, split="train")
-    assert len(ds.features) == 3
-    features = ["metadata", "id"]
-    for feature in features:
+    assert len(ds.features) == 4
+    features = [("metadata", "string"), ("id", "string"), ("unmask", "bool")]
+    for feature, dtype in features:
         assert feature in ds.features
-        assert ds.features[feature].dtype == "string"
+        assert ds.features[feature].dtype == dtype
     assert "messages" in ds.features
     assert len(ds.features["messages"]) == 1
     assert len(ds.features["messages"][0]) == 2