Skip to content

Commit 119edd4

Browse files
authored
Improvements for semantic deduplication and DAPT tutorial (#564)
* push fixes Signed-off-by: Sarah Yurick <sarahyurick@gmail.com> * run black Signed-off-by: Sarah Yurick <sarahyurick@gmail.com> * fix cache? Signed-off-by: Sarah Yurick <sarahyurick@gmail.com> * add vibhu's suggestions Signed-off-by: Sarah Yurick <sarahyurick@gmail.com> * update config to include all params Signed-off-by: Sarah Yurick <sarahyurick@gmail.com> * add vibhu's comments Signed-off-by: Sarah Yurick <sarahyurick@gmail.com> --------- Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>
1 parent 48d48bf commit 119edd4

File tree

14 files changed

+229
-108
lines changed

14 files changed

+229
-108
lines changed

config/sem_dedup_config.yaml

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,22 +3,24 @@ cache_dir: "semdedup_cache"
33
num_files: 16
44

55
# Embeddings configuration
6-
embeddings_save_loc: "embeddings"
76
embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
87
embedding_batch_size: 128
8+
embeddings_save_loc: "embeddings"
9+
# Options: "mean_pooling", "last_token"
10+
embedding_pooling_strategy: "mean_pooling"
11+
embedding_column: "embeddings"
912
write_embeddings_to_disk: true
13+
write_to_filename: false
1014

1115
# Clustering configuration
12-
clustering_save_loc: "clustering_results"
13-
n_clusters: 1000
14-
seed: 1234
1516
max_iter: 100
16-
kmeans_with_cos_dist: false
17-
18-
# Semdedup configuration
19-
which_to_keep: "hard"
20-
largest_cluster_size_to_process: 100000
17+
n_clusters: 1000
18+
clustering_save_loc: "clustering_results"
2119
sim_metric: "cosine"
20+
which_to_keep: "hard"
21+
sort_clusters: true
22+
kmeans_with_cos_dist: false
23+
clustering_input_partition_size: "2gb"
2224

2325
# Extract dedup configuration
2426
eps_thresholds:

docs/user-guide/semdedup.rst

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -42,22 +42,23 @@ Semantic deduplication in NeMo Curator can be configured using a YAML file. Here
4242
num_files: -1
4343
4444
# Embeddings configuration
45-
embeddings_save_loc: "embeddings"
4645
embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
4746
embedding_batch_size: 128
47+
embeddings_save_loc: "embeddings"
48+
embedding_pooling_strategy: "mean_pooling"
49+
embedding_column: "embeddings"
4850
write_embeddings_to_disk: true
51+
write_to_filename: false
4952
5053
# Clustering configuration
51-
clustering_save_loc: "clustering_results"
52-
n_clusters: 1000
53-
seed: 1234
5454
max_iter: 100
55-
kmeans_with_cos_dist: false
56-
57-
# Semdedup configuration
58-
which_to_keep: "hard"
59-
largest_cluster_size_to_process: 100000
55+
n_clusters: 1000
56+
clustering_save_loc: "clustering_results"
6057
sim_metric: "cosine"
58+
which_to_keep: "hard"
59+
sort_clusters: true
60+
kmeans_with_cos_dist: false
61+
clustering_input_partition_size: "2gb"
6162
6263
# Extract dedup configuration
6364
eps_thresholds:

nemo_curator/modules/config.py

Lines changed: 46 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -147,51 +147,72 @@ class SemDedupConfig(BaseConfig):
147147
148148
Attributes:
149149
cache_dir (str): Directory to store cache.
150-
profile_dir (Optional[str]): If specified directory to write dask profile. Default is None.
151-
cache_dir (str): Directory to store cache.
150+
profile_dir (Optional[str]): If specified, directory to write Dask profile.
151+
Default is None.
152152
num_files (int): Number of files. Default is -1, meaning all files.
153-
embeddings_save_loc (str): Location to save embeddings.
153+
154154
embedding_model_name_or_path (str): Model name or path for embeddings.
155-
embedding_batch_size (int): Inital Batch size for processing embeddings.
156-
embedding_pooling_strategy (str): Strategy for pooling embeddings, either "mean_pooling" or "last_token". Defaults to "mean_pooling".
157-
write_embeddings_to_disk (bool): If True, saves the embeddings to disk, defaults to True.
155+
Default is "sentence-transformers/all-MiniLM-L6-v2".
156+
embedding_batch_size (int): Initial batch size for processing embeddings.
157+
Default is 128.
158+
embeddings_save_loc (str): Location to save embeddings.
159+
Default is "embeddings".
160+
embedding_max_mem_gb (int): Maximum memory usage in GB for the embedding process.
161+
If None, it defaults to the available GPU memory minus 4 GB.
162+
embedding_pooling_strategy (str): Strategy for pooling embeddings, either
163+
"mean_pooling" or "last_token". Default is "mean_pooling".
164+
embedding_column (str): The column name that stores the embeddings.
165+
Default is "embeddings".
166+
write_embeddings_to_disk (bool): If True, saves the embeddings to disk.
158167
We recommend setting this to False when you have a delayed pipeline.
159-
Setting it to False can lead to more memory overhead.
168+
Setting it to False can lead to more memory overhead. Default is True.
169+
write_to_filename (bool): If True, saves the embeddings to the same filename as input files.
170+
Default False.
171+
172+
max_iter (int): Maximum iterations for clustering. Default is 100.
173+
n_clusters (int): Number of clusters. Default is 1000.
160174
clustering_save_loc (str): Location to save clustering results.
161-
n_clusters (int): Number of clusters.
162-
seed (int): Seed for clustering.
163-
max_iter (int): Maximum iterations for clustering.
164-
kmeans_with_cos_dist (bool): Use KMeans with cosine distance.
165-
which_to_keep (str): Which duplicates to keep.
166-
largest_cluster_size_to_process (int): Largest cluster size to process.
175+
Default is "clustering_results".
167176
sim_metric (str): Similarity metric for deduplication.
168-
eps_thresholds (List[float]): Epsilon thresholds to calculate if semantically similar or not.
177+
Default is "cosine".
178+
which_to_keep (str): Method to determine which duplicates to keep.
179+
Default is "hard".
180+
sort_clusters (bool): Whether to sort clusters. Default is True.
181+
kmeans_with_cos_dist (bool): Whether or not to use KMeans with cosine distance.
182+
Default is False.
183+
clustering_input_partition_size (str): The size of data partition with which to run KMeans.
184+
Default is "2gb".
185+
186+
eps_thresholds (List[float]): Epsilon thresholds to calculate if semantically
187+
similar or not. Default is [0.01, 0.001].
169188
eps_to_extract (float): Epsilon value to extract deduplicated data.
189+
Default is 0.01.
170190
"""
171191

172192
cache_dir: str
173193
profile_dir: Optional[str] = None
174194
num_files: int = -1
175195

176196
# Embeddings
177-
embeddings_save_loc: str = "embeddings"
178197
embedding_model_name_or_path: str = "sentence-transformers/all-MiniLM-L6-v2"
179198
embedding_batch_size: int = 128
199+
embeddings_save_loc: str = "embeddings"
200+
embedding_max_mem_gb: Optional[int] = None
180201
# Options: "mean_pooling", "last_token"
181202
embedding_pooling_strategy: str = "mean_pooling"
203+
embedding_column: str = "embeddings"
182204
write_embeddings_to_disk: bool = True
205+
write_to_filename: bool = False
183206

184-
# Clustering config
185-
clustering_save_loc: str = "clustering_results"
186-
n_clusters: int = 1000
187-
seed: int = 1234
207+
# Clustering
188208
max_iter: int = 100
189-
kmeans_with_cos_dist: bool = False
190-
191-
# Semdedup config
192-
which_to_keep: str = "hard"
193-
largest_cluster_size_to_process: int = 100000
209+
n_clusters: int = 1000
210+
clustering_save_loc: str = "clustering_results"
194211
sim_metric: str = "cosine"
212+
which_to_keep: str = "hard"
213+
sort_clusters: bool = True
214+
kmeans_with_cos_dist: bool = False
215+
clustering_input_partition_size: str = "2gb"
195216

196217
# Extract dedup config
197218
eps_thresholds: List[float] = field(default_factory=lambda: [0.01, 0.001])

nemo_curator/modules/semantic_dedup/clusteringmodel.py

Lines changed: 32 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -54,12 +54,12 @@ def __init__(
5454
max_iter: int,
5555
n_clusters: int,
5656
clustering_output_dir: str,
57-
embedding_col: str = "embeddings",
57+
embedding_column: str = "embeddings",
5858
sim_metric: str = "cosine",
5959
which_to_keep: str = "hard",
6060
sort_clusters: bool = True,
6161
kmeans_with_cos_dist: bool = False,
62-
partition_size: str = "2gb",
62+
clustering_input_partition_size: str = "2gb",
6363
logger: Union[logging.Logger, str] = "./",
6464
profile_dir: Optional[str] = None,
6565
):
@@ -71,12 +71,12 @@ def __init__(
7171
max_iter (int): Maximum number of iterations for the clustering algorithm.
7272
n_clusters (int): The number of clusters to form.
7373
clustering_output_dir (str): Directory path where clustering results will be saved.
74-
embedding_col (str): Column name where the embeddings are stored.
74+
embedding_column (str): Column name where the embeddings are stored.
7575
sim_metric (str): Similarity metric to use for clustering, default is "cosine".
7676
which_to_keep (str): Strategy to decide which duplicates to keep; default is "hard".
7777
sort_clusters (bool): Whether to sort clusters, default is True.
7878
kmeans_with_cos_dist (bool): Whether to use KMeans with cosine distance, default is False.
79-
partition_size (str): The size of data partition to run kmeans with, default is "2gb".
79+
clustering_input_partition_size (str): The size of data partition to run kmeans with, default is "2gb".
8080
logger (Union[logging.Logger, str]): Logger object or directory path to save logs; default is "./".
8181
profile_dir (str): If specified directory to write dask profile. Default is None.
8282
@@ -86,11 +86,11 @@ def __init__(
8686
self.max_iter = max_iter
8787
self.n_clusters = n_clusters
8888
self.clustering_output_dir = clustering_output_dir
89-
self.embedding_col = embedding_col
89+
self.embedding_column = embedding_column
9090
self.sim_metric = sim_metric
9191
self.keep_hard = which_to_keep == "hard"
9292
self.kmeans_with_cos_dist = kmeans_with_cos_dist
93-
self.partition_size = partition_size
93+
self.clustering_input_partition_size = clustering_input_partition_size
9494
self.sort_clusters = sort_clusters
9595
self.logger = self._setup_logger(logger)
9696
self.profile_dir = profile_dir
@@ -117,22 +117,39 @@ def _setup_logger(self, logger):
117117
def __call__(self, embeddings_dataset: DocumentDataset):
118118
embeddings_df = embeddings_dataset.df
119119

120-
if self.embedding_col not in embeddings_df.columns:
120+
if self.embedding_column not in embeddings_df.columns:
121121
raise ValueError(
122-
f"Expected embedding column '{self.embedding_col}'"
122+
f"Expected embedding column '{self.embedding_column}'"
123123
f" to be in dataset. Only found columns {embeddings_df.columns}"
124124
)
125125

126126
with performance_report_if_with_ts_suffix(self.profile_dir, "clustering-model"):
127-
embeddings_df = embeddings_df[[self.id_col, self.embedding_col]]
127+
embeddings_df = embeddings_df[[self.id_col, self.embedding_column]]
128128
embeddings_df = embeddings_df.repartition(
129-
partition_size=self.partition_size
129+
partition_size=self.clustering_input_partition_size
130130
)
131-
embeddings_df = embeddings_df.to_backend("pandas").persist()
131+
132+
try:
133+
embeddings_df = embeddings_df.to_backend("pandas").persist()
134+
embeddings_length = embeddings_df.shape[0].compute()
135+
136+
if embeddings_length < self.n_clusters:
137+
raise ValueError(
138+
"Number of clusters is greater than the number of documents in your dataset: "
139+
f"dataset length is {embeddings_length} while n_clusters is set to {self.n_clusters}. "
140+
f"Please reduce n_clusters to be less than or equal to {embeddings_length}."
141+
)
142+
except IndexError as e:
143+
raise IndexError(
144+
f'Original error message: "{e}". '
145+
"This could be due to empty partitions in your DocumentDataset. "
146+
"Please check your dataset for empty partitions and remove them if necessary."
147+
)
148+
132149
embeddings_df = embeddings_df.to_backend("cudf")
133150

134151
cupy_darr = embeddings_df.map_partitions(
135-
get_embedding_ar, self.embedding_col, meta=cp.ndarray([1, 1])
152+
get_embedding_ar, self.embedding_column, meta=cp.ndarray([1, 1])
136153
)
137154
cupy_darr.compute_chunk_sizes()
138155
t0 = time.time()
@@ -156,7 +173,7 @@ def __call__(self, embeddings_dataset: DocumentDataset):
156173
meta_df["dist_to_cent"] = cp.zeros(1)
157174
embeddings_df = embeddings_df.map_partitions(
158175
add_dist_to_cents,
159-
embedding_col=self.embedding_col,
176+
embedding_col=self.embedding_column,
160177
centroids=kmeans.cluster_centers_,
161178
meta=meta_df,
162179
)
@@ -198,7 +215,7 @@ def __call__(self, embeddings_dataset: DocumentDataset):
198215
output_sorted_clusters_dir=os.path.join(
199216
self.clustering_output_dir, "sorted"
200217
),
201-
embedding_col=self.embedding_col,
218+
embedding_col=self.embedding_column,
202219
sim_metric=self.sim_metric,
203220
keep_hard=self.keep_hard,
204221
kmeans_with_cos_dist=self.kmeans_with_cos_dist,

nemo_curator/modules/semantic_dedup/embeddings.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -228,6 +228,12 @@ def __call__(self, dataset: DocumentDataset) -> DocumentDataset:
228228
self.profile_dir, "embedding-creator"
229229
):
230230
embedding_ddf = self.create_embeddings(dataset.df, self.input_column)
231+
232+
# category column dtypes are not supported by the GPU-accelerated Parquet writer
233+
for col in embedding_ddf.columns:
234+
if embedding_ddf[col].dtype.name == "category":
235+
embedding_ddf[col] = embedding_ddf[col].astype("str")
236+
231237
write_to_disk(
232238
embedding_ddf,
233239
self.embedding_output_dir,

nemo_curator/modules/semantic_dedup/semanticclusterleveldedup.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -42,7 +42,7 @@ def __init__(
4242
id_column_type: str,
4343
which_to_keep: str,
4444
output_dir: str,
45-
embedding_col: str = "embeddings",
45+
embedding_column: str = "embeddings",
4646
logger: Union[logging.Logger, str] = "./",
4747
profile_dir: Optional[str] = None,
4848
) -> None:
@@ -57,7 +57,7 @@ def __init__(
5757
id_column_type (str): Data type of the ID column.
5858
which_to_keep (str): Strategy for which duplicate to keep.
5959
output_dir (str): Directory to save output files.
60-
embedding_col (str): Column where the embeddings are stored.
60+
embedding_column (str): Column where the embeddings are stored.
6161
logger (Union[logging.Logger, str]): Logger instance or path to the log file directory.
6262
profile_dir (str): If specified directory to write dask profile. Default is None.
6363
"""
@@ -72,7 +72,7 @@ def __init__(
7272
output_dir, "semdedup_pruning_tables"
7373
)
7474
self.computed_semantic_match_dfs = False
75-
self.embedding_col = embedding_col
75+
self.embedding_column = embedding_column
7676
self.logger = self._setup_logger(logger)
7777
self.profile_dir = profile_dir
7878

@@ -132,7 +132,7 @@ def compute_semantic_match_dfs(
132132
id_col_type=self.id_col_type,
133133
eps_list=eps_list,
134134
output_dir=self.semdedup_pruning_tables_dir,
135-
embedding_col=self.embedding_col,
135+
embedding_col=self.embedding_column,
136136
which_to_keep=self.which_to_keep,
137137
)
138138
)

nemo_curator/modules/semantic_dedup/semdedup.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -50,10 +50,13 @@ def __init__(
5050
self.embedding_creator = EmbeddingCreator(
5151
embedding_model_name_or_path=config.embedding_model_name_or_path,
5252
embedding_batch_size=config.embedding_batch_size,
53+
embedding_output_dir=os.path.join(cache_dir, config.embeddings_save_loc),
54+
embedding_max_mem_gb=config.embedding_max_mem_gb,
5355
embedding_pooling_strategy=config.embedding_pooling_strategy,
5456
input_column=input_column,
55-
embedding_output_dir=os.path.join(cache_dir, config.embeddings_save_loc),
57+
embedding_column=config.embedding_column,
5658
write_embeddings_to_disk=config.write_embeddings_to_disk,
59+
write_to_filename=config.write_to_filename,
5760
logger=logger,
5861
profile_dir=self.config.profile_dir,
5962
)
@@ -62,6 +65,12 @@ def __init__(
6265
max_iter=config.max_iter,
6366
n_clusters=config.n_clusters,
6467
clustering_output_dir=os.path.join(cache_dir, config.clustering_save_loc),
68+
embedding_column=config.embedding_column,
69+
sim_metric=config.sim_metric,
70+
which_to_keep=config.which_to_keep,
71+
sort_clusters=config.sort_clusters,
72+
kmeans_with_cos_dist=config.kmeans_with_cos_dist,
73+
clustering_input_partition_size=config.clustering_input_partition_size,
6574
logger=logger,
6675
profile_dir=self.config.profile_dir,
6776
)
@@ -77,6 +86,7 @@ def __init__(
7786
id_column_type=id_column_type,
7887
which_to_keep=config.which_to_keep,
7988
output_dir=os.path.join(cache_dir, config.clustering_save_loc),
89+
embedding_column=config.embedding_column,
8090
logger=logger,
8191
profile_dir=self.config.profile_dir,
8292
)

0 commit comments

Comments
 (0)