
Commit e662ac0

Add KMeans random_state to semantic deduplication configs (#575)
* Add KMeans random_state to semantic deduplication configs

Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>

* edit docstrings

Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>

---------

Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>
1 parent e41159b commit e662ac0

13 files changed: +140 -73 lines changed

config/sem_dedup_config.yaml

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ write_to_filename: false
 max_iter: 100
 n_clusters: 1000
 clustering_save_loc: "clustering_results"
+random_state: 1234
 sim_metric: "cosine"
 which_to_keep: "hard"
 sort_clusters: true
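
For reference, a minimal sketch of how this YAML, including the new random_state key, would typically be loaded. It assumes SemDedupConfig.from_yaml is available and parses this file into the dataclass changed below in nemo_curator/modules/config.py; the printed values simply echo the defaults shown in the diff.

# Minimal sketch, assuming SemDedupConfig.from_yaml reads the YAML above.
from nemo_curator.modules.config import SemDedupConfig

config = SemDedupConfig.from_yaml("config/sem_dedup_config.yaml")
print(config.n_clusters)    # 1000
print(config.random_state)  # 1234, the new KMeans seed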

docs/user-guide/semdedup.rst

Lines changed: 1 addition & 0 deletions
@@ -54,6 +54,7 @@ Semantic deduplication in NeMo Curator can be configured using a YAML file. Here
 max_iter: 100
 n_clusters: 1000
 clustering_save_loc: "clustering_results"
+random_state: 1234
 sim_metric: "cosine"
 which_to_keep: "hard"
 sort_clusters: true

nemo_curator/modules/config.py

Lines changed: 2 additions & 0 deletions
@@ -173,6 +173,7 @@ class SemDedupConfig(BaseConfig):
         n_clusters (int): Number of clusters. Default is 1000.
         clustering_save_loc (str): Location to save clustering results.
             Default is "clustering_results".
+        random_state (int): KMeans random state used for reproducibility. Default is 1234.
         sim_metric (str): Similarity metric for deduplication.
             Default is "cosine".
         which_to_keep (str): Method to determine which duplicates to keep.
@@ -208,6 +209,7 @@ class SemDedupConfig(BaseConfig):
     max_iter: int = 100
     n_clusters: int = 1000
     clustering_save_loc: str = "clustering_results"
+    random_state: int = 1234
     sim_metric: str = "cosine"
     which_to_keep: str = "hard"
     sort_clusters: bool = True
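
A short sketch of building the updated dataclass directly and overriding the new field. The cache_dir path is hypothetical, and it is assumed here to be the only field without a default (it is the one attribute the scripts below read unconditionally).

# Sketch: overriding random_state when constructing SemDedupConfig in code.
from nemo_curator.modules.config import SemDedupConfig

config = SemDedupConfig(
    cache_dir="./semdedup_cache",  # hypothetical path
    n_clusters=1000,
    max_iter=100,
    random_state=42,  # any fixed seed makes the KMeans step reproducible
)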

nemo_curator/modules/semantic_dedup/clusteringmodel.py

Lines changed: 42 additions & 28 deletions
@@ -32,7 +32,6 @@
 from nemo_curator.utils.semdedup_utils import assign_and_sort_clusters
 
 
-### Clustering Module
 def get_embedding_ar(df: "cudf.DataFrame", embedding_col: str) -> cp.ndarray:
     return df[embedding_col].list.leaves.values.reshape(len(df), -1)
 
@@ -47,14 +46,16 @@ def add_dist_to_cents(
     return df
 
 
+# Clustering module
 class ClusteringModel:
     def __init__(
         self,
-        id_column: str,
-        max_iter: int,
-        n_clusters: int,
-        clustering_output_dir: str,
+        id_column: str = "id",
+        max_iter: int = 100,
+        n_clusters: int = 1000,
+        clustering_output_dir: str = "./clustering_results",
         embedding_column: str = "embeddings",
+        random_state: int = 1234,
         sim_metric: str = "cosine",
         which_to_keep: str = "hard",
         sort_clusters: bool = True,
@@ -68,25 +69,36 @@ def __init__(
 
         Args:
             id_column (str): Column name used as the identifier in the dataset.
-            max_iter (int): Maximum number of iterations for the clustering algorithm.
-            n_clusters (int): The number of clusters to form.
-            clustering_output_dir (str): Directory path where clustering results will be saved.
-            embedding_column (str): Column name where the embeddings are stored.
-            sim_metric (str): Similarity metric to use for clustering, default is "cosine".
-            which_to_keep (str): Strategy to decide which duplicates to keep; default is "hard".
-            sort_clusters (bool): Whether to sort clusters, default is True.
-            kmeans_with_cos_dist (bool): Whether to use KMeans with cosine distance, default is False.
-            clustering_input_partition_size (str): The size of data partition to run kmeans with, default is "2gb".
-            logger (Union[logging.Logger, str]): Logger object or directory path to save logs; default is "./".
-            profile_dir (str): If specified directory to write dask profile. Default is None.
-
-        This constructor sets up the parameters required for clustering operations.
+                Default is "id".
+            max_iter (int): Maximum iterations for clustering. Default is 100.
+            n_clusters (int): Number of clusters. Default is 1000.
+            clustering_output_dir (str): Location to save clustering results.
+                Default is "./clustering_results".
+            embedding_column (str): The column name that stores the embeddings.
+                Default is "embeddings".
+            random_state (int): KMeans random state used for reproducibility.
+                Default is 1234.
+            sim_metric (str): Similarity metric for deduplication.
+                Default is "cosine".
+            which_to_keep (str): Method to determine which duplicates to keep.
+                Default is "hard".
+            sort_clusters (bool): Whether to sort clusters. Default is True.
+            kmeans_with_cos_dist (bool): Whether or not to use KMeans with cosine distance.
+                Default is False.
+            clustering_input_partition_size (str): The size of data partition with which to run KMeans.
+                Default is "2gb".
+            logger (Union[logging.Logger, str]): Existing logger to log to, or a path to a log directory.
+                Default is "./".
+            profile_dir (Optional[str]): If specified, directory to write Dask profile.
+                Default is None.
+
         """
         self.id_col = id_column
         self.max_iter = max_iter
         self.n_clusters = n_clusters
         self.clustering_output_dir = clustering_output_dir
         self.embedding_column = embedding_column
+        self.random_state = random_state
         self.sim_metric = sim_metric
         self.keep_hard = which_to_keep == "hard"
         self.kmeans_with_cos_dist = kmeans_with_cos_dist
@@ -119,7 +131,7 @@ def __call__(self, embeddings_dataset: DocumentDataset):
 
         if self.embedding_column not in embeddings_df.columns:
             raise ValueError(
-                f"Expected embedding column '{self.embedding_column}'"
+                f'Expected embedding column "{self.embedding_column}"'
                 f" to be in dataset. Only found columns {embeddings_df.columns}"
             )
 
@@ -153,18 +165,22 @@ def __call__(self, embeddings_dataset: DocumentDataset):
             )
             cupy_darr.compute_chunk_sizes()
            t0 = time.time()
-            kmeans = KMeans(n_clusters=self.n_clusters, max_iter=self.max_iter)
+            kmeans = KMeans(
+                n_clusters=self.n_clusters,
+                max_iter=self.max_iter,
+                random_state=self.random_state,
+            )
             self.logger.info("KMeans starting fit")
             kmeans.fit(cupy_darr)
             self.logger.info("KMeans fit complete")
-            self.logger.info(f"Time taken for KMeans Fit: {time.time() - t0}")
+            self.logger.info(f"Time taken for KMeans fit: {time.time() - t0}")
 
             self.logger.info(
-                "Computing nearest centroids + distance to centers using kmeans.predict"
+                "Computing nearest centroids and distance to centers using kmeans.predict"
             )
             t0 = time.time()
             nearest_cents = kmeans.predict(cupy_darr)
-            self.logger.info(f"Time taken for KMeans Predict: {time.time() - t0}")
+            self.logger.info(f"Time taken for KMeans predict: {time.time() - t0}")
 
             t0 = time.time()
             embeddings_df["nearest_cent"] = nearest_cents.astype(np.int32)
@@ -196,13 +212,11 @@ def __call__(self, embeddings_dataset: DocumentDataset):
                 shutil.rmtree(clustering_output_dir)
 
             embeddings_df.to_parquet(
-                clustering_output_dir,
-                index=False,
-                partition_on="nearest_cent",
+                clustering_output_dir, index=False, partition_on="nearest_cent"
             )
             self.logger.info(
-                f"Time taken for Assigning distance to each embedding : {time.time() - t0} "
-                f"and output written at {clustering_output_dir}"
+                f"Time taken for assigning distance to each embedding: {time.time() - t0}s"
+                f" and output written at {clustering_output_dir}"
             )
 
             del embeddings_df
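
A hedged sketch of the updated ClusteringModel API. The embeddings_dataset variable is assumed to be a cuDF-backed DocumentDataset produced by EmbeddingCreator, with the vectors stored in an "embeddings" list column as expected by the code above.

# Sketch: ClusteringModel now accepts random_state and passes it to the cuML KMeans fit.
from nemo_curator.modules.semantic_dedup.clusteringmodel import ClusteringModel

clustering_model = ClusteringModel(
    id_column="id",
    max_iter=100,
    n_clusters=1000,
    clustering_output_dir="./clustering_results",
    random_state=1234,  # new argument
)
clustered_dataset = clustering_model(embeddings_dataset)  # embeddings_dataset assumed above

The seed is now read from the configuration rather than left implicit in cuML, so experiments can pin it for repeatable cluster assignments or vary it deliberately.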

nemo_curator/modules/semantic_dedup/embeddings.py

Lines changed: 28 additions & 27 deletions
@@ -36,7 +36,7 @@
 )
 
 
-# Embedding Creation Module
+# Embedding creation module
 @dataclass
 class EmbeddingConfig:
     model_name_or_path: str
@@ -47,7 +47,7 @@ def __post_init__(self):
         self.max_seq_length = AutoTokenizer.from_pretrained(
             self.model_name_or_path
         ).model_max_length
-        # Gaurd against the HF bug
+        # Guard against Hugging Face bug
         # which sets max_seq_length to max(int) for some models
         if self.max_seq_length > 1e5:
             self.max_seq_length = AutoConfig.from_pretrained(
@@ -133,9 +133,9 @@ def load_tokenizer(self):
 class EmbeddingCreator:
     def __init__(
         self,
-        embedding_model_name_or_path: str,
-        embedding_batch_size: int,
-        embedding_output_dir: str,
+        embedding_model_name_or_path: str = "sentence-transformers/all-MiniLM-L6-v2",
+        embedding_batch_size: int = 128,
+        embedding_output_dir: str = "./embeddings",
         embedding_max_mem_gb: Optional[int] = None,
         embedding_pooling_strategy: str = "mean_pooling",
         input_column: str = "text",
@@ -149,28 +149,29 @@ def __init__(
         Initializes an EmbeddingCreator for generating embeddings using the specified model configurations.
 
         Args:
-            embedding_model_name_or_path (str): The path or identifier for the model used to generate embeddings.
-            embedding_batch_size (int): Number of samples to process in each batch.
-            embedding_output_dir (str): Directory path where embeddings will be saved.
-            embedding_max_mem_gb (int): Maximum memory usage in GB for the embedding process.
-                If None, it defaults to the available GPU memory minus 4 GB.
-            embedding_pooling_strategy (str): Strategy for pooling embeddings, either "mean_pooling" or "last_token". Defaults to "mean_pooling".
-            input_column (str): Column name from the data to be used for embedding generation, defaults to "text".
-            write_embeddings_to_disk (bool, optional): If True, saves the embeddings to disk, defaults to True.
-                We recommend setting this to False when you have a delayed pipeline.
-                Setting it to False can lead to more memory overhead.
-            write_to_filename (bool): If True, saves the embeddings to the same filename as input files, defaults to False.
-            logger (Union[logging.Logger, str]): Logger object or path to store logs, defaults to "./".
-            profile_dir (str): If specified directory to write dask profile. Default is None.
-
-        Attributes:
-            embeddings_config (EmbeddingConfig): Configuration for embeddings.
-            batch_size (int): Batch size for embedding generation.
-            logger (logging.Logger): Logger instance for the class.
-            embedding_output_dir (str): Output directory for embeddings.
-            input_column (str): Input column for data processing.
-            model (EmbeddingCrossFitModel): Model instance for embedding generation.
-            write_to_filename (bool): If True, saves the embeddings to the same filename as input files, defaults to False.
+            embedding_model_name_or_path (str): Model name or path for embeddings.
+                Default is "sentence-transformers/all-MiniLM-L6-v2".
+            embedding_batch_size (int): Initial batch size for processing embeddings.
+                Default is 128.
+            embedding_output_dir (str): Location to save embeddings.
+                Default is "./embeddings".
+            embedding_max_mem_gb (int, optional): Maximum memory usage in GB for the embedding process.
+                If None, it defaults to the available GPU memory minus 4 GB.
+            embedding_pooling_strategy: Strategy for pooling embeddings, either "mean_pooling" or "last_token".
+                Default is "mean_pooling".
+            input_column (str): Column name from the data to be used for embedding generation.
+                Default is "text".
+            embedding_column (str): The column name that stores the embeddings. Default is "embeddings".
+            write_embeddings_to_disk (bool): If True, saves the embeddings to disk.
+                We recommend setting this to False when you have a delayed pipeline.
+                Setting it to False can lead to more memory overhead. Default is True.
+            write_to_filename (bool): If True, saves the embeddings to the same filename as input files.
+                Default is False.
+            logger (Union[logging.Logger, str]): Existing logger to log to, or a path to a log directory.
+                Default is "./".
+            profile_dir (Optional[str]): If specified, directory to write Dask profile.
+                Default is None.
+
         """
 
         self.embeddings_config = EmbeddingConfig(
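
A brief sketch of the updated EmbeddingCreator defaults. The explicit arguments below simply restate those defaults; dataset is assumed to be a DocumentDataset of raw text with a "text" column, and the call form matches the usage in compute_embeddings.py further down.

# Sketch: with the new defaults, EmbeddingCreator can be built with no arguments.
from nemo_curator.modules.semantic_dedup.embeddings import EmbeddingCreator

embedding_creator = EmbeddingCreator(
    embedding_model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
    embedding_batch_size=128,
    embedding_output_dir="./embeddings",
    input_column="text",
)
embedding_dataset = embedding_creator(dataset=dataset)  # dataset assumed above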

nemo_curator/modules/semantic_dedup/semanticclusterleveldedup.py

Lines changed: 24 additions & 14 deletions
@@ -35,13 +35,13 @@
 class SemanticClusterLevelDedup:
     def __init__(
         self,
-        n_clusters: int,
-        emb_by_clust_dir: str,
-        sorted_clusters_dir: str,
-        id_column: str,
-        id_column_type: str,
-        which_to_keep: str,
-        output_dir: str,
+        n_clusters: int = 1000,
+        emb_by_clust_dir: str = "./clustering_results/embs_by_nearest_center",
+        sorted_clusters_dir: str = "./clustering_results/sorted",
+        id_column: str = "id",
+        id_column_type: str = "int",
+        which_to_keep: str = "hard",
+        output_dir: str = "./clustering_results",
         embedding_column: str = "embeddings",
         logger: Union[logging.Logger, str] = "./",
         profile_dir: Optional[str] = None,
@@ -50,16 +50,25 @@ def __init__(
         Initialize the SemanticClusterLevelDedup class.
 
         Args:
-            n_clusters (int): Number of clusters.
+            n_clusters (int): Number of clusters. Default is 1000.
             emb_by_clust_dir (str): Directory containing embeddings by cluster.
+                Default is "./clustering_results/embs_by_nearest_center".
             sorted_clusters_dir (str): Directory containing sorted clusters.
-            id_column (str): Column name for IDs.
-            id_column_type (str): Data type of the ID column.
-            which_to_keep (str): Strategy for which duplicate to keep.
+                Default is "./clustering_results/sorted".
+            id_column (str): Column name used as the identifier in the dataset.
+                Default is "id".
+            id_column_type (str): Data type of id_column. Default is "int".
+            which_to_keep (str): Method to determine which duplicates to keep.
+                Default is "hard".
             output_dir (str): Directory to save output files.
-            embedding_column (str): Column where the embeddings are stored.
-            logger (Union[logging.Logger, str]): Logger instance or path to the log file directory.
-            profile_dir (str): If specified directory to write dask profile. Default is None.
+                Default is "./clustering_results".
+            embedding_column (str): The column name that stores the embeddings.
+                Default is "embeddings".
+            logger (Union[logging.Logger, str]): Existing logger to log to, or a path to a log directory.
+                Default is "./".
+            profile_dir (Optional[str]): If specified, directory to write Dask profile.
+                Default is None.
+
         """
         self.n_clusters = n_clusters
         self.emb_by_clust_dir = emb_by_clust_dir
@@ -118,6 +127,7 @@ def compute_semantic_match_dfs(
             shutil.rmtree(self.semdedup_pruning_tables_dir)
         expand_outdir_and_mkdir(self.semdedup_pruning_tables_dir)
         t0 = time.time()
+
        with performance_report_if_with_ts_suffix(
             self.profile_dir, "semantic-match-compute"
         ):
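
A hedged sketch of the updated SemanticClusterLevelDedup defaults. The directory arguments restate the new default values, which line up with ClusteringModel's "./clustering_results" output layout.

# Sketch: the new defaults point at ClusteringModel's standard output directories.
from nemo_curator.modules.semantic_dedup.semanticclusterleveldedup import (
    SemanticClusterLevelDedup,
)

cluster_dedup = SemanticClusterLevelDedup(
    n_clusters=1000,
    emb_by_clust_dir="./clustering_results/embs_by_nearest_center",
    sorted_clusters_dir="./clustering_results/sorted",
    id_column="id",
    id_column_type="int",
    which_to_keep="hard",
    output_dir="./clustering_results",
)
# cluster_dedup.compute_semantic_match_dfs(...) then builds the pruning tables
# touched in the last hunk above (its argument list is not shown in this diff).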

nemo_curator/modules/semantic_dedup/semdedup.py

Lines changed: 7 additions & 1 deletion
@@ -41,7 +41,13 @@ def __init__(
 
         Args:
             config (SemDedupConfig): Configuration for SemDedup.
-            logger (Union[logging.Logger, str]): Logger instance or path to the log file directory.
+            input_column (str): Column name from the data to be used for embedding generation.
+                Default is "text".
+            id_column (str): Column name used as the identifier in the dataset.
+                Default is "id".
+            id_column_type (str): Data type of id_column. Default is "int".
+            logger (Union[logging.Logger, str]): Existing logger to log to, or a path to a log directory.
+                Default is "./".
         """
         super().__init__(input_backend="cudf")
         self.config = config
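
For orientation, a hedged end-to-end sketch of the SemDedup module, which drives embedding creation, clustering, and cluster-level dedup from a single SemDedupConfig, so the new random_state reaches the clustering step. The cache_dir path is hypothetical, and the final call is indicative only.

# Sketch: running the whole semantic dedup pipeline from one config object.
from nemo_curator.modules.config import SemDedupConfig
from nemo_curator.modules.semantic_dedup.semdedup import SemDedup

config = SemDedupConfig(cache_dir="./semdedup_cache", random_state=1234)
sem_dedup = SemDedup(
    config=config,
    input_column="text",
    id_column="id",
    id_column_type="int",
)
# result = sem_dedup(dataset)  # dataset: cuDF-backed DocumentDataset, per input_backend="cudf"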

nemo_curator/scripts/semdedup/clustering.py

Lines changed: 7 additions & 0 deletions
@@ -66,6 +66,13 @@ def main(args):
         max_iter=semdedup_config.max_iter,
         n_clusters=semdedup_config.n_clusters,
         clustering_output_dir=clustering_output_dir,
+        embedding_column=semdedup_config.embedding_column,
+        random_state=semdedup_config.random_state,
+        sim_metric=semdedup_config.sim_metric,
+        which_to_keep=semdedup_config.which_to_keep,
+        sort_clusters=semdedup_config.sort_clusters,
+        kmeans_with_cos_dist=semdedup_config.kmeans_with_cos_dist,
+        clustering_input_partition_size=semdedup_config.clustering_input_partition_size,
         logger=logger,
     )
 
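
The script now forwards random_state (along with the other clustering options) from the YAML config to ClusteringModel. The reproducibility this provides can be illustrated with a small standalone check; this sketch assumes a GPU with cuML and CuPy installed and is not part of the script itself.

# Sketch: with a fixed random_state, two independent cuML KMeans fits over the
# same data agree on every label, which is what seeding the clustering step buys.
import cupy as cp
from cuml.cluster import KMeans

x = cp.random.RandomState(0).standard_normal((1000, 16), dtype=cp.float32)
labels_a = KMeans(n_clusters=8, max_iter=100, random_state=1234).fit_predict(x)
labels_b = KMeans(n_clusters=8, max_iter=100, random_state=1234).fit_predict(x)
assert bool((labels_a == labels_b).all())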

nemo_curator/scripts/semdedup/compute_embeddings.py

Lines changed: 6 additions & 3 deletions
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -41,7 +41,7 @@ def main(args):
         semdedup_config.cache_dir, semdedup_config.embeddings_save_loc
     )
 
-    # Some time jsonl files are stored as .json
+    # Sometimes JSONL files are stored as .json
     # So to handle that case we can pass the input_file_extension
     if args.input_file_extension is not None:
         input_file_extension = args.input_file_extension
@@ -79,10 +79,13 @@ def main(args):
         embedding_output_dir=os.path.join(
             semdedup_config.cache_dir, semdedup_config.embeddings_save_loc
         ),
+        embedding_max_mem_gb=semdedup_config.embedding_max_mem_gb,
+        embedding_pooling_strategy=semdedup_config.embedding_pooling_strategy,
         input_column=args.input_text_field,
+        embedding_column=semdedup_config.embedding_column,
         write_embeddings_to_disk=semdedup_config.write_embeddings_to_disk,
+        write_to_filename=semdedup_config.write_to_filename,
         logger=logger,
-        write_to_filename=True,
     )
 
     embedding_dataset = embedding_creator(dataset=dataset)
