Commit 70a3346
Add batched pairwise similarity method for Semantic Dedup (#581)
* add batched similarity
* pre-commit
* increase tolerance
* uncomment
* pr review + sad that we can only run tests on gpu
* pc
* pr suggestions
* another test config
* reduce tolerance for random array

Signed-off-by: Praateek <praateekm@gmail.com>
1 parent 242c7f4 commit 70a3346

11 files changed: +165 −28 lines

config/sem_dedup_config.yaml

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@ clustering_save_loc: "clustering_results"
 random_state: 1234
 sim_metric: "cosine"
 which_to_keep: "hard"
+batched_cosine_similarity: 1024
 sort_clusters: true
 kmeans_with_cos_dist: false
 clustering_input_partition_size: "2gb"

docs/user-guide/semdedup.rst

Lines changed: 3 additions & 1 deletion
@@ -57,6 +57,7 @@ Semantic deduplication in NeMo Curator can be configured using a YAML file. Here
 random_state: 1234
 sim_metric: "cosine"
 which_to_keep: "hard"
+batched_cosine_similarity: 1024
 sort_clusters: true
 kmeans_with_cos_dist: false
 clustering_input_partition_size: "2gb"
@@ -209,6 +210,7 @@ Use Individual Components
     id_column="doc_id",
     id_column_type="str",
     which_to_keep="hard",
+    batched_cosine_similarity=1024,
     output_dir="path/to/output/deduped",
     logger="path/to/log/dir"
 )
@@ -257,7 +259,7 @@ Key parameters in the configuration file include:
 - ``n_clusters``: Number of clusters for k-means clustering.
 - ``eps_to_extract``: Deduplication threshold. Higher values result in more aggressive deduplication.
 - ``which_to_keep``: Strategy for choosing which duplicate to keep ("hard" or "soft").
-
+- ``batched_cosine_similarity``: Whether to use batched cosine similarity (has less memory usage, O(N*B) where B is the batch size) or vanilla cosine similarity (O(N^2) memory usage).
 -----------------------------------------
 Output
 -----------------------------------------
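To make the O(N*B) versus O(N^2) claim in the new ``batched_cosine_similarity`` bullet concrete, here is a rough back-of-the-envelope sketch. It is not part of the commit; the cluster size N is hypothetical and fp32 (4-byte) similarity scores are assumed.

# Approximate peak size of the similarity matrix for one cluster,
# assuming 4-byte fp32 scores. Illustrative numbers, not from the commit.
N = 100_000  # items in the cluster (hypothetical)
B = 1024     # the default batch size

full_matrix_gb = N * N * 4 / 1e9    # O(N^2): the entire N x N matrix at once
batched_slab_gb = N * B * 4 / 1e9   # O(N*B): one N x B column block at a time

print(f"full:    {full_matrix_gb:.1f} GB")   # 40.0 GB
print(f"batched: {batched_slab_gb:.2f} GB")  # 0.41 GB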

nemo_curator/modules/config.py

Lines changed: 15 additions & 1 deletion
@@ -14,7 +14,7 @@

 import warnings
 from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import List, Optional, Union

 import yaml

@@ -178,6 +178,9 @@ class SemDedupConfig(BaseConfig):
             Default is "cosine".
         which_to_keep (str): Method to determine which duplicates to keep.
             Default is "hard".
+        batched_cosine_similarity (Union[bool, int]): Whether to use batched cosine similarity (has less memory usage).
+            Default is 1024. When False or 0, no batching is used and memory requirements are O(N^2) where N is the number of items in the cluster.
+            When True, batch size is set to 1024 and memory requirements are O(N*B) where N is the number of items in the cluster and B is the batch size.
         sort_clusters (bool): Whether to sort clusters. Default is True.
         kmeans_with_cos_dist (bool): Whether or not to use KMeans with cosine distance.
             Default is False.
@@ -199,6 +202,7 @@ class SemDedupConfig(BaseConfig):
     embedding_batch_size: int = 128
     embeddings_save_loc: str = "embeddings"
     embedding_max_mem_gb: Optional[int] = None
+
     # Options: "mean_pooling", "last_token"
     embedding_pooling_strategy: str = "mean_pooling"
     embedding_column: str = "embeddings"
@@ -212,6 +216,7 @@ class SemDedupConfig(BaseConfig):
     random_state: int = 1234
     sim_metric: str = "cosine"
     which_to_keep: str = "hard"
+    batched_cosine_similarity: Union[bool, int] = 1024
     sort_clusters: bool = True
     kmeans_with_cos_dist: bool = False
     clustering_input_partition_size: str = "2gb"
@@ -230,3 +235,12 @@ def __post_init__(self):
             raise ValueError(
                 f"Epsilon to extract {self.eps_to_extract} must be in eps_thresholds {self.eps_thresholds}"
             )
+
+        # Convert bool to int
+        if isinstance(self.batched_cosine_similarity, bool):
+            if self.batched_cosine_similarity:
+                self.batched_cosine_similarity = 1024
+            else:
+                self.batched_cosine_similarity = 0
+        if not isinstance(self.batched_cosine_similarity, int):
+            raise ValueError("batched_cosine_similarity must be an integer")

nemo_curator/modules/semantic_dedup/semanticclusterleveldedup.py

Lines changed: 6 additions & 1 deletion
@@ -23,7 +23,6 @@

 from nemo_curator.datasets import DocumentDataset
 from nemo_curator.log import create_logger
-from nemo_curator.modules.config import SemDedupConfig
 from nemo_curator.utils.distributed_utils import performance_report_if_with_ts_suffix
 from nemo_curator.utils.file_utils import expand_outdir_and_mkdir
 from nemo_curator.utils.semdedup_utils import (
@@ -43,6 +42,7 @@ def __init__(
         which_to_keep: str = "hard",
         output_dir: str = "./clustering_results",
         embedding_column: str = "embeddings",
+        batched_cosine_similarity: int = 1024,
         logger: Union[logging.Logger, str] = "./",
         profile_dir: Optional[str] = None,
     ) -> None:
@@ -64,6 +64,9 @@
                 Default is "./clustering_results".
             embedding_column (str): The column name that stores the embeddings.
                 Default is "embeddings".
+            batched_cosine_similarity (int): Whether to use batched cosine similarity (has less memory usage).
+                Default is 1024. When greater than 0, batching is used and memory requirements are O(N*B) where N is the number of items in the cluster and B is the batch size.
+                When less than or equal to 0, no batching is used and memory requirements are O(N^2) where N is the number of items in the cluster.
             logger (Union[logging.Logger, str]): Existing logger to log to, or a path to a log directory.
                 Default is "./".
             profile_dir (Optional[str]): If specified, directory to write Dask profile.
@@ -82,6 +85,7 @@
         )
         self.computed_semantic_match_dfs = False
         self.embedding_column = embedding_column
+        self.batched_cosine_similarity = batched_cosine_similarity
         self.logger = self._setup_logger(logger)
         self.profile_dir = profile_dir

@@ -144,6 +148,7 @@ def compute_semantic_match_dfs(
                     output_dir=self.semdedup_pruning_tables_dir,
                     embedding_col=self.embedding_column,
                     which_to_keep=self.which_to_keep,
+                    batched_cosine_similarity=self.batched_cosine_similarity,
                 )
             )
         tasks.compute()
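For context, a hedged usage sketch of the new argument. The keyword arguments shown mirror the documentation example earlier in this commit; the clustering-related arguments and directory paths are placeholders, not taken from the diff.

from nemo_curator.modules.semantic_dedup.semanticclusterleveldedup import (
    SemanticClusterLevelDedup,
)

semantic_dedup = SemanticClusterLevelDedup(
    n_clusters=1000,                              # placeholder clustering setup
    emb_by_clust_dir="path/to/embs_by_cluster",   # placeholder path
    sorted_clusters_dir="path/to/sorted_clusters",
    id_column="doc_id",
    id_column_type="str",
    which_to_keep="hard",
    batched_cosine_similarity=1024,  # > 0: O(N*B) batched path; <= 0: full O(N^2) matrix
    output_dir="path/to/output/deduped",
    logger="path/to/log/dir",
)
semantic_dedup.compute_semantic_match_dfs([0.01, 0.001])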

nemo_curator/modules/semantic_dedup/semdedup.py

Lines changed: 1 addition & 0 deletions
@@ -91,6 +91,7 @@ def __init__(
             id_column=id_column,
             id_column_type=id_column_type,
             which_to_keep=config.which_to_keep,
+            batched_cosine_similarity=config.batched_cosine_similarity,
             output_dir=os.path.join(cache_dir, config.clustering_save_loc),
             embedding_column=config.embedding_column,
             logger=logger,

nemo_curator/scripts/semdedup/extract_dedup_data.py

Lines changed: 1 addition & 0 deletions
@@ -54,6 +54,7 @@ def main(args):
        id_column=args.id_column,
        id_column_type=args.id_column_type,
        which_to_keep=semdedup_config.which_to_keep,
+       batched_cosine_similarity=semdedup_config.batched_cosine_similarity,
        output_dir=os.path.join(
            semdedup_config.cache_dir, semdedup_config.clustering_save_loc
        ),

nemo_curator/utils/semdedup_utils.py

Lines changed: 67 additions & 25 deletions
@@ -18,7 +18,7 @@
 import random
 import shutil
 import time
-from typing import List, Optional, Tuple
+from typing import List, Literal, Optional, Tuple

 import cudf
 import dask.bag as db
@@ -179,25 +179,60 @@ def rank_within_cluster(
     return len(cluster_ids) - missing_files


-def _semdedup(
-    cluster_reps: torch.Tensor, device: str
+def pairwise_cosine_similarity(
+    cluster_reps: torch.Tensor,
+    device: Literal["cuda", "cpu"],
 ) -> Tuple[torch.Tensor, List[int]]:
-    # compute pairwise cos sim between cluster items,
-    # then replace to diagonal with zeros to ignore self similarity
-    cluster_reps.to(device)
-    pair_w_sim_matrix = cluster_reps @ (cluster_reps.T)
+    """
+    Compute pairwise cosine similarity between cluster items,
+    then replace to diagonal with zeros to ignore self similarity
+    """
+    # Move to device
+    cluster_reps = cluster_reps.to(device)
+    # Compute pairwise cosine similarity
+    pairwise_sim_matrix = torch.mm(cluster_reps, cluster_reps.T)
     del cluster_reps
-    pair_w_sim_matrix.fill_diagonal_(0.0)
-    assert pair_w_sim_matrix.shape[0] == pair_w_sim_matrix.shape[1]
-
-    triu_sim_mat = torch.triu(pair_w_sim_matrix, diagonal=1)
-
-    M = torch.max(triu_sim_mat, dim=0)[0].cpu()
-    M1 = torch.max(triu_sim_mat, dim=0)[1].cpu().numpy().tolist()
-    return M, M1
-
-
-def get_cluster_reps(
+    # Get upper triangular matrix
+    assert pairwise_sim_matrix.shape[0] == pairwise_sim_matrix.shape[1]
+    triu_sim_mat = torch.triu(pairwise_sim_matrix, diagonal=1)
+    # Get max similarity and indices
+    max_values_and_indices = torch.max(triu_sim_mat, dim=0)
+    max_similarity = max_values_and_indices[0].cpu()
+    max_indices = max_values_and_indices[1].cpu().numpy().tolist()
+    return max_similarity, max_indices
+
+
+def pairwise_cosine_similarity_batched(
+    cluster_reps: torch.Tensor,
+    device: Literal["cuda", "cpu"],
+    batch_size: int = 1024,
+) -> Tuple[torch.Tensor, List[int]]:
+    """
+    Computes pairwise cosine similarity between cluster items,
+    then replace to diagonal with zeros to ignore self similarity.
+    This function is useful for large clusters where the pairwise similarity matrix
+    does not fit into memory.
+    We use a batched approach to compute the pairwise similarity matrix in batches.
+    Memory requirements are O(N*B) where N is the number of items in the cluster and B is the batch size
+    instead of O(N^2) for the full matrix.
+    """
+    cluster_reps = cluster_reps.to(device)
+    max_similarity = torch.zeros(cluster_reps.shape[0], device=device)
+    max_indices = torch.zeros(cluster_reps.shape[0], dtype=torch.int64, device=device)
+    for start_idx in range(0, cluster_reps.shape[0], batch_size):
+        end_idx = min(start_idx + batch_size, cluster_reps.shape[0])
+        batch = cluster_reps[start_idx:end_idx]
+        pairwise_sim_matrix = torch.mm(cluster_reps, batch.T)
+        triu_sim_matrix = torch.triu(pairwise_sim_matrix, diagonal=1 - start_idx)
+        del batch, pairwise_sim_matrix
+        max_values_and_indices = torch.max(triu_sim_matrix, dim=0)
+        max_similarity[start_idx:end_idx] = max_values_and_indices[0]
+        max_indices[start_idx:end_idx] = max_values_and_indices[1]

+    return max_similarity.cpu(), max_indices.cpu().numpy().tolist()
+
+
+def get_normalized_cluster_reps(
     cluster_id: int,
     emb_by_clust_dir: str,
     id_col: str,
@@ -220,6 +255,8 @@ def get_cluster_reps(
         cluster_reps[embedding_col].list.leaves.values.reshape(len(cluster_reps), -1),
         device="cuda",
     )
+    # Normalize embeddings
+    cluster_reps = cluster_reps / cluster_reps.norm(dim=1, keepdim=True)
     return cluster_reps


@@ -233,6 +270,7 @@ def get_semantic_matches_per_cluster(
     output_dir: str,
     embedding_col: str,
     which_to_keep: str,
+    batched_cosine_similarity: int = 1024,
 ) -> None:

     output_df_file_path = os.path.join(output_dir, f"cluster_{cluster_id}.parquet")
@@ -266,22 +304,26 @@

     text_ids = cluster_i[:, 0].astype(id_col_type)

-    cluster_reps = get_cluster_reps(
+    cluster_reps = get_normalized_cluster_reps(
         cluster_id, emb_by_clust_dir, id_col, embedding_col, text_ids
     )
-    M, M1 = _semdedup(cluster_reps, "cuda")
+    if batched_cosine_similarity > 0:
+        max_similarity, max_indices = pairwise_cosine_similarity_batched(
+            cluster_reps, "cuda", batched_cosine_similarity
+        )
+    else:
+        max_similarity, max_indices = pairwise_cosine_similarity(cluster_reps, "cuda")
     assert cluster_reps.shape[0] == len(text_ids)
-
-    M1_id = [text_ids[m] for m in M1]
+    max_indices_id = [text_ids[m] for m in max_indices]

     points_to_remove_df = cudf.DataFrame()
     points_to_remove_df["indices"] = clutser_items_indices
     points_to_remove_df["id"] = text_ids
-    points_to_remove_df["max_id"] = M1_id
-    points_to_remove_df["cosine_sim_score"] = M.numpy().tolist()
+    points_to_remove_df["max_id"] = max_indices_id
+    points_to_remove_df["cosine_sim_score"] = max_similarity.numpy().tolist()

     for eps in eps_list:
-        eps_points_to_remove = M > 1 - eps
+        eps_points_to_remove = max_similarity > 1 - eps
         points_to_remove_df[f"eps={eps}"] = eps_points_to_remove

     points_to_remove_df.to_parquet(output_df_file_path)
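The one non-obvious line in pairwise_cosine_similarity_batched is torch.triu(pairwise_sim_matrix, diagonal=1 - start_idx). For the column block starting at start_idx, keeping entries where (local column) - row >= 1 - start_idx is exactly keeping (global column) > row, i.e. the strict upper triangle restricted to those columns. A small self-contained check of that equivalence (CPU-only, not from the commit):

import torch

torch.manual_seed(0)
N, D, B = 7, 4, 3
reps = torch.randn(N, D)
reps = reps / reps.norm(dim=1, keepdim=True)  # row-normalize, as get_normalized_cluster_reps does

# Full strict upper triangle of the N x N similarity matrix
full_triu = torch.triu(reps @ reps.T, diagonal=1)

# Rebuild it column-block by column-block using the shifted diagonal
blocks = []
for start_idx in range(0, N, B):
    block = reps @ reps[start_idx : start_idx + B].T  # N x (<=B) column block
    blocks.append(torch.triu(block, diagonal=1 - start_idx))

assert torch.allclose(torch.cat(blocks, dim=1), full_triu)

Because each column block is still multiplied against all N rows, torch.max(..., dim=0) sees the same full column as the unbatched path, which is why the tests below can require identical indices from both implementations.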

tests/test_semdedup.py

Lines changed: 68 additions & 0 deletions
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+from typing import TYPE_CHECKING, Literal

 import numpy as np
 import pytest
@@ -29,6 +30,17 @@
 EmbeddingCreator = gpu_only_import_from(
     "nemo_curator.modules.semantic_dedup.embeddings", "EmbeddingCreator"
 )
+pairwise_cosine_similarity = gpu_only_import_from(
+    "nemo_curator.utils.semdedup_utils", "pairwise_cosine_similarity"
+)
+pairwise_cosine_similarity_batched = gpu_only_import_from(
+    "nemo_curator.utils.semdedup_utils", "pairwise_cosine_similarity_batched"
+)
+if TYPE_CHECKING:
+    from nemo_curator.utils.semdedup_utils import (
+        pairwise_cosine_similarity,
+        pairwise_cosine_similarity_batched,
+    )


 @pytest.fixture
@@ -233,3 +245,59 @@ def get_reference_embeddings(
         embs.append(normed_emb)

     return np.array(embs)
+
+
+class TestPairwiseCosineSimilarity:
+    def setup_method(self):
+        # We create a 5x3 array where each row is a unit vector
+        # The second and last two rows are the same
+        input_arr = torch.tensor(
+            np.asarray(
+                [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [1, 2, 3], [1, 2, 3]],
+            ),
+            dtype=torch.float32,
+        )
+        # Normalize the input array
+        self.input_arr = input_arr / torch.norm(input_arr, dim=1, keepdim=True)
+        self.expected_similarity = torch.tensor(
+            [0.0000, 0.974631, 0.998190, 0.999618, 1.0000, 1.0000]
+        )
+        self.expected_indices = [0, 0, 1, 2, 0, 0]
+
+    @pytest.mark.parametrize("device", [pytest.param("cuda", marks=pytest.mark.gpu)])
+    def test_pairwise_cosine_similarity(self, device: Literal["cpu", "cuda"]):
+        max_similarity, max_indices = pairwise_cosine_similarity(
+            self.input_arr.to(device), device
+        )
+        torch.testing.assert_close(
+            max_similarity, self.expected_similarity, rtol=1e-6, atol=1e-6
+        )
+        assert max_indices == self.expected_indices
+
+    @pytest.mark.parametrize("device", [pytest.param("cuda", marks=pytest.mark.gpu)])
+    @pytest.mark.parametrize("batch_size", [1, 2, 3, 4, 5, 6])
+    def test_pairwise_cosine_similarity_batched(
+        self, device: Literal["cpu", "cuda"], batch_size: int
+    ):
+        max_similarity, max_indices = pairwise_cosine_similarity_batched(
+            self.input_arr.to(device), device, batch_size
+        )
+        torch.testing.assert_close(max_similarity, self.expected_similarity)
+        assert max_indices == self.expected_indices
+
+    @pytest.mark.parametrize("device", [pytest.param("cuda", marks=pytest.mark.gpu)])
+    @pytest.mark.parametrize("batch_size", [100, 512, 1024, 2048])
+    def test_pairwise_cosine_similarity_batched_rand_array(
+        self, device: Literal["cpu", "cuda"], batch_size: int
+    ):
+        N = 1024
+        D = 512
+        rand_arr = torch.randn(N, D, device=device)
+        max_similarity, max_indices = pairwise_cosine_similarity(rand_arr, device)
+        max_similarity_batched, max_indices_batched = (
+            pairwise_cosine_similarity_batched(rand_arr, device, batch_size=batch_size)
+        )
+        torch.testing.assert_close(
+            max_similarity, max_similarity_batched, rtol=1e-5, atol=1e-5
+        )
+        assert max_indices == max_indices_batched
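The expected values in setup_method are simply pairwise cosines of the fixture's six raw rows: row 0 has no earlier row (the strict upper triangle leaves it at 0.0 with index 0), rows 4 and 5 duplicate row 0 (similarity 1.0, index 0), and the middle entries come from each row's nearest earlier neighbor. For instance, the 0.974631 entry is the cosine between rows 0 and 1. A quick check, not part of the test file:

import numpy as np

a, b = np.array([1.0, 2.0, 3.0]), np.array([4.0, 5.0, 6.0])
cosine = a @ b / (np.linalg.norm(a) * np.linalg.norm(b))
print(cosine)  # 0.9746318..., matching expected_similarity[1] within the test's 1e-6 tolerance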

tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ clustering_save_loc: "clustering_results"
 random_state: 1234
 sim_metric: "cosine"
 which_to_keep: "hard"
+batched_cosine_similarity: 1024
 sort_clusters: true
 kmeans_with_cos_dist: false
 clustering_input_partition_size: "2gb"

tutorials/image-curation/image-curation.ipynb

Lines changed: 1 addition & 0 deletions
@@ -681,6 +681,7 @@
     "    id_column_type=\"str\",\n",
     "    embedding_col=\"image_embedding\",\n",
     "    which_to_keep=\"hard\",\n",
+    "    batched_cosine_similarity=1024,\n",
     "    output_dir=duplicate_output,\n",
     ")\n",
     "semantic_dedup.compute_semantic_match_dfs([0.01, 0.001])\n",

0 commit comments