Commit f1067d4

Fix minor bugs in fuzzy workflow (#999)
Signed-off-by: Ayush Dattagupta <ayushdg95@gmail.com>
1 parent 8ccd7d4 commit f1067d4

5 files changed, +24 -5 lines changed

nemo_curator/stages/deduplication/fuzzy/identify_duplicates.py

Lines changed: 3 additions & 0 deletions
@@ -95,6 +95,9 @@ def _get_removal_ids(self, df: "cudf.DataFrame") -> "cudf.DataFrame":
         """
         Get the removal ids for the given dataframe.
         """
+        if len(df) == 0:
+            return df[[self.document_id_field]]
+
         removal_ids = df[df[self.duplicate_group_field].duplicated(keep="first")][self.document_id_field]
         removal_ids = removal_ids.sort_values(ignore_index=True)
         return removal_ids.to_frame()
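The guard added above makes _get_removal_ids a no-op on empty partitions, returning an empty frame with just the id column. A minimal sketch of the behavior, using pandas as a stand-in for cudf (the relevant API is shared) and a free function in place of the stage method:

import pandas as pd

def get_removal_ids(df: pd.DataFrame, id_field: str, group_field: str) -> pd.DataFrame:
    # Empty partition: short-circuit with an empty single-column frame,
    # mirroring the early return added in this commit.
    if len(df) == 0:
        return df[[id_field]]
    # Keep every duplicate after the first occurrence within each group.
    removal_ids = df[df[group_field].duplicated(keep="first")][id_field]
    return removal_ids.sort_values(ignore_index=True).to_frame()

empty = pd.DataFrame({"id": [], "group": []})
print(len(get_removal_ids(empty, "id", "group")))  # 0, no exception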

nemo_curator/stages/deduplication/fuzzy/lsh/lsh.py

Lines changed: 2 additions & 0 deletions
@@ -245,6 +245,8 @@ def group_by_bucket(self, df: cudf.DataFrame, include_singles: bool = False) ->
         -------
         DataFrame with bucket IDs and lists of document IDs.
         """
+        if len(df) == 0:
+            return df
         if not include_singles:
             # TODO: Add support for generating LSH index with single-document buckets that can be reused in incremental runs
             # Find bucket_ids that appear more than once (have multiple documents)
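The same pattern protects group_by_bucket: zero input rows means zero buckets, so the frame can be returned unchanged. A hypothetical pandas sketch of the non-singleton grouping described in the diff's comments, with the new guard in place (column names are illustrative):

import pandas as pd

def group_by_bucket(df: pd.DataFrame) -> pd.DataFrame:
    if len(df) == 0:
        return df  # nothing to group
    # Keep only bucket_ids that appear more than once (have multiple documents),
    # then collect each surviving bucket's document ids into a list.
    counts = df["bucket_id"].value_counts()
    multi = df[df["bucket_id"].isin(counts[counts > 1].index)]
    return multi.groupby("bucket_id")["doc_id"].agg(list).reset_index()

print(group_by_bucket(pd.DataFrame({"bucket_id": [], "doc_id": []})))  # empty frame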

nemo_curator/stages/deduplication/fuzzy/lsh/stage.py

Lines changed: 5 additions & 0 deletions
@@ -62,6 +62,9 @@ class LSHStage(ProcessingStage[FileGroupTask, FileGroupTask]):
     bands_per_iteration
         Number of bands to process per shuffle iteration. Between 1 and num_bands.
         Higher values reduce the number of shuffle iterations but increase the memory usage.
+    total_nparts
+        Total number of partitions to write during the shuffle.
+        If None, the number of partitions will be decided automatically by the executor as the closest power of 2 <= number of input tasks.
     """

     _name = "LSHStage"

@@ -84,6 +87,7 @@ class LSHStage(ProcessingStage[FileGroupTask, FileGroupTask]):
     spill_memory_limit: int | Literal["auto"] | None = "auto"
     enable_statistics: bool = False
     bands_per_iteration: int = 5  # number of bands to process in each iteration
+    total_nparts: int | None = None

     def __post_init__(self):
         super().__init__()

@@ -102,6 +106,7 @@ def __post_init__(self):
             "enable_statistics": self.enable_statistics,
             "read_kwargs": self.read_kwargs,
             "write_kwargs": self.write_kwargs,
+            "total_nparts": self.total_nparts,  # Can be None, executor will set it
         }

         if self.bands_per_iteration < 1 or self.bands_per_iteration > self.num_bands:
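The new docstring pins down the fallback when total_nparts is None: the closest power of two that does not exceed the number of input tasks. A small sketch of that rule (the function name is ours, not the executor's):

import math

def default_total_nparts(num_input_tasks: int) -> int:
    # Closest power of 2 <= number of input tasks, per the docstring above.
    return 2 ** int(math.log2(num_input_tasks))

assert default_total_nparts(10) == 8   # 8 <= 10 < 16
assert default_total_nparts(16) == 16  # already a power of two
assert default_total_nparts(1) == 1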

nemo_curator/stages/deduplication/fuzzy/workflow.py

Lines changed: 5 additions & 4 deletions
@@ -221,7 +221,8 @@ def _create_lsh_pipeline(self) -> Pipeline:
                 num_bands=self.num_bands,
                 minhashes_per_band=self.minhashes_per_band,
                 output_path=self.cache_path,
-                read_kwargs=self.read_kwargs,
+                # Reading minhashes from cache_path
+                read_kwargs=self.cache_kwargs,
                 write_kwargs=self.cache_kwargs,
                 bands_per_iteration=self.bands_per_iteration,
                 rmm_pool_size="auto",

@@ -236,17 +237,17 @@ def _create_connected_components_pipeline(self) -> Pipeline:
             stages=[
                 BucketsToEdgesStage(
                     output_path=self.cache_path,
-                    read_kwargs=self.read_kwargs,
+                    read_kwargs=self.cache_kwargs,
                     write_kwargs=self.cache_kwargs,
                 ),
                 ConnectedComponentsStage(
                     output_path=self.cache_path,
-                    read_kwargs=self.read_kwargs,
+                    read_kwargs=self.cache_kwargs,
                     write_kwargs=self.cache_kwargs,
                 ),
                 IdentifyDuplicatesStage(
                     output_path=self.output_path,
-                    read_kwargs=self.read_kwargs,
+                    read_kwargs=self.cache_kwargs,
                     write_kwargs=self.write_kwargs,
                     rmm_pool_size="auto",
                     spill_memory_limit="auto",
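All four read_kwargs -> cache_kwargs swaps follow one rule: a stage that consumes another stage's output from cache_path must read with the cache's storage options, while read_kwargs describes only the user's original input. A hypothetical one-liner capturing that routing (names are illustrative, not the workflow's API):

def reader_kwargs(reads_from_cache: bool, read_kwargs: dict, cache_kwargs: dict) -> dict:
    # LSH, BucketsToEdges, ConnectedComponents and IdentifyDuplicates all read
    # intermediate results from cache_path, so they take cache_kwargs; only the
    # stage that reads the raw input should take read_kwargs.
    return cache_kwargs if reads_from_cache else read_kwargs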

tests/stages/deduplication/fuzzy/test_lsh_stage.py

Lines changed: 9 additions & 1 deletion
@@ -62,12 +62,19 @@ def minhash_data(self, tmp_path: Path) -> FileGroupTask:
         },
     )

-    @pytest.mark.parametrize("bands_per_iteration", [2, 3])
+    @pytest.mark.parametrize(
+        ("bands_per_iteration", "total_nparts"),
+        [
+            (2, 4),
+            (3, None),
+        ],
+    )
     def test_lsh(
         self,
         minhash_data: FileGroupTask,
         tmp_path: Path,
         bands_per_iteration: int,
+        total_nparts: int | None,
     ) -> None:
         # Create LSHStage
         lsh_stage = LSHStage(

@@ -77,6 +84,7 @@ def test_lsh(
             bands_per_iteration=bands_per_iteration,
             minhash_field="_minhash_signature",
             id_field=CURATOR_DEDUP_ID_STR,
+            total_nparts=total_nparts,
         )

         # Create pipeline and executor
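The parametrization now pairs each bands_per_iteration value with a total_nparts value, so the suite exercises both an explicit partition count (4) and the executor-chosen default (None). A standalone sketch of the same pytest pattern:

import pytest

@pytest.mark.parametrize(
    ("bands_per_iteration", "total_nparts"),
    [(2, 4), (3, None)],
)
def test_pairing(bands_per_iteration: int, total_nparts: int | None) -> None:
    # pytest generates one case per tuple: (2, 4) and (3, None).
    assert (total_nparts is None) or total_nparts > 0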
