Commit e7d7ef6

Changes in semdedup scripts (#785)
* Changes in semdedup scripts

  Signed-off-by: abdr17 <abdulrahmanejaz19@gmail.com>

* Made required changes in semdedup tests

  Signed-off-by: abdr17 <abdulrahmanejaz19@gmail.com>

---------

Signed-off-by: abdr17 <abdulrahmanejaz19@gmail.com>
1 parent c772cd2 commit e7d7ef6

File tree

5 files changed: 24 additions & 18 deletions


nemo_curator/modules/semantic_dedup/semanticclusterleveldedup.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -149,7 +149,7 @@ def extract_dedup_data(self, eps_to_extract: float) -> DocumentDataset:
             msg = "eps_to_extract must be a float"
             self.logger.error(msg)
             raise TypeError(msg)
-        output_parquet_path = os.path.join(self.output_dir, f"unique_ids_{eps_to_extract}.parquet")
+        output_parquet_path = os.path.join(self.output_dir, f"duplicate_ids_{eps_to_extract}.parquet")

         t0 = time.time()
         with performance_report_if_with_ts_suffix(
```
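The renamed f-string means each extraction threshold gets its own output file. A minimal sketch of the resulting naming scheme (the `output_dir` value here is hypothetical; in practice it comes from the clustering save location in the config):

```python
import os

def duplicate_ids_path(output_dir: str, eps_to_extract: float) -> str:
    # Mirrors the renamed f-string above: the eps value is embedded in the
    # filename, so extractions at different thresholds do not overwrite
    # each other.
    return os.path.join(output_dir, f"duplicate_ids_{eps_to_extract}.parquet")

print(duplicate_ids_path("clustering_results", 0.01))
# clustering_results/duplicate_ids_0.01.parquet
```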

nemo_curator/scripts/semdedup/README.md

Lines changed: 3 additions & 3 deletions
````diff
@@ -28,10 +28,10 @@ Please edit `config/sem_dedup_config.yaml` to configure the pipeline and run it
 - `embs_by_nearest_center` directory, containing `nearest_cent={x}` where x ranges from 0 to `num_clusters - 1`
 - Parquet files within `embs_by_nearest_center/nearest_cent={x}` containing the data points in each cluster

-4) Extract deduplicated data
+4) Extract duplicate data IDs to remove
 ```sh
-semdedup_extract_unique_ids --id-column "my_id" --id-column-type "str" --config-file "$CONFIG_FILE"
+semdedup_extract_duplicate_ids --id-column "my_id" --config-file "$CONFIG_FILE"
 ```
 **Input:** Output from step (3) and YAML file from step (1)

-**Output:** `{config.cache_dir}/{config.clustering_save_loc}/unique_ids_{}.parquet`
+**Output:** `{config.cache_dir}/{config.clustering_save_loc}/duplicate_ids_{}.parquet`
````
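Because the output now lists IDs to remove rather than IDs to keep, a downstream consumer filters its dataset with an anti-join. A minimal sketch, assuming a pandas workflow and hypothetical file paths (the `id` column corresponds to the `--id-column` passed above):

```python
import pandas as pd

# Hypothetical paths; the duplicate-IDs file is the output of the step above.
duplicate_ids = pd.read_parquet("clustering_results/duplicate_ids_0.01.parquet")
dataset = pd.read_parquet("input_dataset.parquet")

# Anti-join: keep only rows whose id is NOT flagged as a duplicate.
deduped = dataset[~dataset["id"].isin(duplicate_ids["id"])]
deduped.to_parquet("deduped_dataset.parquet")
```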

nemo_curator/scripts/semdedup/extract_dedup_data.py

Lines changed: 8 additions & 2 deletions
```diff
@@ -58,7 +58,14 @@ def main(args: argparse.Namespace) -> None:

     semantic_dedup.compute_semantic_match_dfs()
     dedup_id_dataset = semantic_dedup.extract_dedup_data(eps_to_extract=semdedup_config.eps_to_extract)
-    print(dedup_id_dataset.df.head(10))
+
+    len_dedup_id_dataset = len(dedup_id_dataset.df.index)
+
+    # Check whether duplicates are found or not
+    if len_dedup_id_dataset == 0:
+        logger.info("No semantic duplicates found!")
+    else:
+        print(dedup_id_dataset.df.head(10, npartitions=-1))

     dt2 = time.perf_counter()
     logger.info(f"End: {dt2}")
@@ -77,7 +84,6 @@ def attach_args() -> argparse.ArgumentParser:
         "earlier using semdedup_extract_embeddings and semdedup_cluster_embeddings."
         "Input arguments include: "
         "--id-column for the identifier in the dataset, "
-        "--id-column-type for the data type of ID column, "
         "--config-file for the path to the semantic deduplication configuration file. "
         "Important configuration parameters include:"
         " cache_dir for the directory to store cache"
```

pyproject.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -166,7 +166,7 @@ fineweb_nemotron_edu_classifier_inference = "nemo_curator.scripts.classifiers.fi
 blend_datasets = "nemo_curator.scripts.blend_datasets:console_script"
 semdedup_extract_embeddings = "nemo_curator.scripts.semdedup.compute_embeddings:console_script"
 semdedup_clustering = "nemo_curator.scripts.semdedup.clustering:console_script"
-semdedup_extract_unique_ids = "nemo_curator.scripts.semdedup.extract_dedup_data:console_script"
+semdedup_extract_duplicate_ids = "nemo_curator.scripts.semdedup.extract_dedup_data:console_script"
 async_llm_pii_redaction = "nemo_curator.scripts.async_llm_pii_redaction:console_script"
 llm_pii_redaction = "nemo_curator.scripts.llm_pii_redaction:console_script"

```
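The rename only changes the installed command name; both the old and new entry points resolve to the same `console_script` callable in `extract_dedup_data.py`. One way to check which semdedup commands an environment actually exposes (a sketch assuming Python 3.10+, where `entry_points` accepts a `group` keyword):

```python
from importlib.metadata import entry_points

# After this commit, the listing should show semdedup_extract_duplicate_ids
# rather than semdedup_extract_unique_ids.
for ep in entry_points(group="console_scripts"):
    if ep.name.startswith("semdedup"):
        print(f"{ep.name} -> {ep.value}")
```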

tests/test_semdedup.py

Lines changed: 11 additions & 11 deletions
```diff
@@ -690,22 +690,22 @@ def test_semantic_cluster_level_dedup(

     # Call extract_dedup_data
     semantic_cluster_level_dedup.extract_dedup_data(eps_to_extract=0.01)
-    # Check content of unique_ids
-    unique_ids_path = os.path.join(semantic_extraction_output_dir, "unique_ids_0.01.parquet")
-    assert os.path.exists(unique_ids_path)
-    unique_ids_df = pd.read_parquet(unique_ids_path)
-    assert unique_ids_df.columns.tolist() == [
+    # Check content of duplicate_ids
+    duplicate_ids_path = os.path.join(semantic_extraction_output_dir, "duplicate_ids_0.01.parquet")
+    assert os.path.exists(duplicate_ids_path)
+    duplicate_ids_df = pd.read_parquet(duplicate_ids_path)
+    assert duplicate_ids_df.columns.tolist() == [
         "id",
         "cosine_dist_to_cent",
         "cluster",
     ]

-    # Check that the filtered content of semdedup_pruning_table matches the unique_ids
+    # Check that the filtered content of semdedup_pruning_table matches the duplicate_ids
     semdedup_pruning_tables_df_filtered = semdedup_pruning_tables_df[
         semdedup_pruning_tables_df["cosine_sim_score"] >= 1 - 0.01
     ]
-    assert len(semdedup_pruning_tables_df_filtered) == len(unique_ids_df)
-    assert set(semdedup_pruning_tables_df_filtered["id"].to_list()) == set(unique_ids_df["id"].to_list())
+    assert len(semdedup_pruning_tables_df_filtered) == len(duplicate_ids_df)
+    assert set(semdedup_pruning_tables_df_filtered["id"].to_list()) == set(duplicate_ids_df["id"].to_list())

     # Check content of summary file
     summary_path = os.path.join(semantic_extraction_output_dir, "dedup_summary_0.01.csv")
@@ -729,6 +729,6 @@ def test_semantic_cluster_level_dedup(
             }
         ),
     )
-    # Ensure that the unique_ids are also correct (this implicitly checks for semdedup_pruning_tables output)
-    assert len(unique_ids_df) == _removed
-    assert len(set(unique_ids_df["id"].to_list())) == _removed
+    # Ensure that the duplicate_ids are also correct (this implicitly checks for semdedup_pruning_tables output)
+    assert len(duplicate_ids_df) == _removed
+    assert len(set(duplicate_ids_df["id"].to_list())) == _removed
```
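The filter the test applies encodes the pruning rule as I read it: a row in the pruning table counts as a duplicate at threshold `eps` when its `cosine_sim_score` is at least `1 - eps`, i.e. its cosine distance is at most `eps`. A minimal sketch of that arithmetic:

```python
def is_duplicate(cosine_sim_score: float, eps: float) -> bool:
    # cosine distance = 1 - cosine similarity, so sim >= 1 - eps
    # is the same condition as dist <= eps.
    return cosine_sim_score >= 1 - eps

assert is_duplicate(0.995, eps=0.01)     # dist 0.005 <= 0.01 -> pruned
assert not is_duplicate(0.98, eps=0.01)  # dist 0.02  >  0.01 -> kept
```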
