Skip to content

Commit 7b501a9

Browse files
authored
Update classifier and semantic deduplication PyTests (#697)
* update semdedup checks to be more robust Signed-off-by: Sarah Yurick <[email protected]> * fix ruff Signed-off-by: Sarah Yurick <[email protected]> * fix tolist Signed-off-by: Sarah Yurick <[email protected]> * relax rounding for prompt classifier test Signed-off-by: Sarah Yurick <[email protected]> --------- Signed-off-by: Sarah Yurick <[email protected]>
1 parent 65e2fbc commit 7b501a9

File tree

2 files changed

+36
-14
lines changed

2 files changed

+36
-14
lines changed

tests/test_classifiers.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -273,12 +273,16 @@ def test_prompt_task_complexity_classifier(gpu_client) -> None: # noqa: ANN001,
273273
# Rounded values to account for floating point errors
274274
result_pred["constraint_ct"] = round(result_pred["constraint_ct"], 2)
275275
expected_pred["constraint_ct"] = round(expected_pred["constraint_ct"], 2)
276-
result_pred["contextual_knowledge"] = round(result_pred["contextual_knowledge"], 3)
277-
expected_pred["contextual_knowledge"] = round(expected_pred["contextual_knowledge"], 3)
276+
result_pred["contextual_knowledge"] = round(result_pred["contextual_knowledge"], 2)
277+
expected_pred["contextual_knowledge"] = round(expected_pred["contextual_knowledge"], 2)
278278
result_pred["creativity_scope"] = round(result_pred["creativity_scope"], 2)
279279
expected_pred["creativity_scope"] = round(expected_pred["creativity_scope"], 2)
280-
result_pred["prompt_complexity_score"] = round(result_pred["prompt_complexity_score"], 3)
281-
expected_pred["prompt_complexity_score"] = round(expected_pred["prompt_complexity_score"], 3)
280+
result_pred["domain_knowledge"] = round(result_pred["domain_knowledge"], 2)
281+
expected_pred["domain_knowledge"] = round(expected_pred["domain_knowledge"], 2)
282+
result_pred["prompt_complexity_score"] = round(result_pred["prompt_complexity_score"], 2)
283+
expected_pred["prompt_complexity_score"] = round(expected_pred["prompt_complexity_score"], 2)
284+
result_pred["reasoning"] = round(result_pred["reasoning"], 2)
285+
expected_pred["reasoning"] = round(expected_pred["reasoning"], 2)
282286
result_pred["task_type_prob"] = round(result_pred["task_type_prob"], 2)
283287
expected_pred["task_type_prob"] = round(expected_pred["task_type_prob"], 2)
284288

tests/test_semdedup.py

Lines changed: 28 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -11,8 +11,10 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14+
1415
import os
1516
import random
17+
from itertools import product
1618
from pathlib import Path
1719
from typing import TYPE_CHECKING, Literal
1820

@@ -145,19 +147,35 @@ def test_sem_dedup(
145147
# Correctly returns the original dataset with no duplicates removed
146148
result = sem_duplicates(dedup_data)
147149
result_df = result.df.compute()
148-
docs_to_remove = [1, 100]
149-
if id_col_type == "str":
150-
docs_to_remove = list(map(str, docs_to_remove))
151150

152151
if not perform_removal:
153-
expected_df = cudf.Series(docs_to_remove, name="id", dtype=id_col_type)
154-
assert_eq(result_df["id"].sort_values(), expected_df, check_index=False)
152+
first_doc_to_remove = [1, 2, 3, 4]
153+
second_doc_to_remove = [100, 200, 300]
154+
# Generate all possible combinations of documents to remove
155+
expected_series_list = [
156+
cudf.Series([a, b], name="id", dtype=id_col_type).sort_values().reset_index(drop=True)
157+
for a, b in product(first_doc_to_remove, second_doc_to_remove)
158+
]
159+
160+
result_series = result_df["id"].sort_values().reset_index(drop=True)
161+
assert any(result_series.equals(expected_series) for expected_series in expected_series_list)
155162
else:
156-
assert_eq(
157-
result_df,
158-
dedup_data.df[~dedup_data.df["id"].isin(docs_to_remove)],
159-
check_index=False,
160-
)
163+
if id_col_type == "int":
164+
first_doc_to_keep = {1, 2, 3, 4}
165+
second_doc_to_keep = {100, 200, 300}
166+
else:
167+
first_doc_to_keep = {"1", "2", "3", "4"}
168+
second_doc_to_keep = {"100", "200", "300"}
169+
170+
result_ids = set(result_df["id"].to_arrow().to_pylist())
171+
172+
# Intersection of the sets
173+
num_kept_from_first = len(result_ids & first_doc_to_keep)
174+
num_kept_from_second = len(result_ids & second_doc_to_keep)
175+
176+
assert len(result_ids) == 5 # noqa: PLR2004
177+
assert num_kept_from_first == 3 # noqa: PLR2004
178+
assert num_kept_from_second == 2 # noqa: PLR2004
161179

162180
@pytest.mark.parametrize("n_clusters", [2, 3])
163181
@pytest.mark.parametrize("perform_removal", [True, False])

0 commit comments

Comments
 (0)