NVIDIA-NeMo
diff --git a/ray-curator/ray_curator/backends/experimental/ray_actor_pool/shuffle_adapter.py b/ray-curator/ray_curator/backends/experimental/ray_actor_pool/shuffle_adapter.py
Lines changed: 2 additions & 1 deletion
diff --git a/ray-curator/ray_curator/stages/deduplication/fuzzy/__init__.py b/ray-curator/ray_curator/stages/deduplication/fuzzy/__init__.py
Lines changed: 3 additions & 0 deletions
diff --git a/ray-curator/ray_curator/stages/deduplication/fuzzy/buckets_to_edges.py b/ray-curator/ray_curator/stages/deduplication/fuzzy/buckets_to_edges.py
Lines changed: 7 additions & 10 deletions
diff --git a/ray-curator/ray_curator/stages/deduplication/fuzzy/connected_components.py b/ray-curator/ray_curator/stages/deduplication/fuzzy/connected_components.py
Lines changed: 24 additions & 23 deletions
diff --git a/ray-curator/ray_curator/stages/deduplication/fuzzy/identify_duplicates.py b/ray-curator/ray_curator/stages/deduplication/fuzzy/identify_duplicates.py
Lines changed: 144 additions & 0 deletions
@@ -16,7 +16,8 @@
     from ray_curator.stages.shuffler.stage import ShuffleStage
 
 
-@ray.remote
+# TODO: Remove once UCX memory usage with GPU staging buffers is fixed.
+@ray.remote(runtime_env={"env_vars": {"UCX_RNDV_FRAG_MEM_TYPES": "host"}})
 class ShuffleStageAdapter(BaseStageAdapter):
     """Ray actor that wraps a shuffle stage and its actor.
 
 
@@ -0,0 +1,3 @@
+from ray_curator.stages.deduplication.fuzzy.workflow import FuzzyDeduplicationWorkflow
+
+__all__ = ["FuzzyDeduplicationWorkflow"]
@@ -24,7 +24,7 @@
 from ray_curator.stages.deduplication.id_generator import CURATOR_DEDUP_ID_STR
 from ray_curator.stages.resources import Resources
 from ray_curator.tasks import FileGroupTask
-from ray_curator.utils.file_utils import delete_dir, get_fs, is_not_empty
+from ray_curator.utils.file_utils import create_or_overwrite_dir, get_fs
 
 
 class BucketsToEdgesStage(ProcessingStage[FileGroupTask, FileGroupTask]):
@@ -34,7 +34,7 @@ class BucketsToEdgesStage(ProcessingStage[FileGroupTask, FileGroupTask]):
 
     Args:
         doc_id_field: The field name containing the document ids for each bucket.
-        output_dir: The directory to write the output file to.
+        output_path: The directory to write the output file to.
         read_kwargs: Keyword arguments to pass for reading the input files.
             Only the storage_options key is supported for now.
         write_kwargs: Keyword arguments to pass for writing the output files.
@@ -46,7 +46,7 @@ class BucketsToEdgesStage(ProcessingStage[FileGroupTask, FileGroupTask]):
 
     def __init__(
         self,
-        output_dir: str,
+        output_path: str,
         doc_id_field: str = CURATOR_DEDUP_ID_STR,
         read_kwargs: dict[str, Any] | None = None,
         write_kwargs: dict[str, Any] | None = None,
@@ -57,14 +57,11 @@ def __init__(
         self.read_storage_options = read_kwargs.get("storage_options") if read_kwargs is not None else None
         self.write_storage_options = write_kwargs.get("storage_options") if write_kwargs is not None else None
 
-        self.output_fs = get_fs(output_dir, self.write_storage_options)
-        self.output_dir = self.output_fs.sep.join([output_dir, self.name])
+        self.output_fs = get_fs(output_path, self.write_storage_options)
+        self.output_path = self.output_fs.sep.join([output_path, self.name])
 
         # Handle output directory cleanup logic
-        if is_not_empty(self.output_dir, self.output_fs):
-            logger.warning(f"Output directory {self.output_dir} is not empty. Deleting it.")
-            delete_dir(self.output_dir, self.output_fs)
-        self.output_fs.mkdirs(self.output_dir, exist_ok=True)
+        create_or_overwrite_dir(self.output_path, fs=self.output_fs)
 
     def _check_io_kwargs(self, kwargs: dict[str, Any] | None) -> None:
         if kwargs is not None:
@@ -81,7 +78,7 @@ def process(self, task: FileGroupTask) -> FileGroupTask:
         edges = [list(edge) for edge in edges]
         edges = pa.Table.from_pandas(pd.DataFrame(edges, columns=[f"{self.doc_id_field}_x", f"{self.doc_id_field}_y"]))
 
-        output_path = self.output_fs.sep.join([self.output_dir, f"{task.task_id}.parquet"])
+        output_path = self.output_fs.sep.join([self.output_path, f"{task._uuid}.parquet"])
         pq.write_table(edges, output_path, filesystem=self.output_fs)
         return FileGroupTask(
             task_id=f"{task.task_id}",
 
@@ -23,6 +23,7 @@
 
 from ray_curator.backends.experimental.utils import RayStageSpecKeys
 from ray_curator.stages.base import ProcessingStage
+from ray_curator.stages.deduplication.fuzzy.utils import CURATOR_FUZZY_DUPLICATE_GROUP_FIELD
 from ray_curator.stages.deduplication.id_generator import (
     CURATOR_DEDUP_ID_STR,
 )
@@ -38,23 +39,23 @@
 class ConnectedComponentsStage(ProcessingStage[FileGroupTask, FileGroupTask], DeduplicationIO):
     def __init__(
         self,
-        output_dir: str,
-        source_column: str = f"{CURATOR_DEDUP_ID_STR}_x",
-        destination_column: str = f"{CURATOR_DEDUP_ID_STR}_y",
+        output_path: str,
+        source_field: str = f"{CURATOR_DEDUP_ID_STR}_x",
+        destination_field: str = f"{CURATOR_DEDUP_ID_STR}_y",
         read_kwargs: dict | None = None,
         write_kwargs: dict | None = None,
     ):
         """
         Args:
-            output_dir: The directory to write the resulting connected components to.
-            source_column: The column name containing the document ids of the source of the edge.
-            destination_column: The column name containing the document ids of the destination of the edge.
+            output_path: The path to write the resulting connected components to.
+            source_field: The field name containing the document ids of the source of the edge.
+            destination_field: The field name containing the document ids of the destination of the edge.
             read_kwargs: Keyword arguments to pass for reading the input files.
             write_kwargs: Keyword arguments to pass for writing the output files.
         """
 
-        self.source_column = source_column
-        self.destination_column = destination_column
+        self.source_field = source_field
+        self.destination_field = destination_field
         self.read_kwargs = read_kwargs if read_kwargs is not None else {}
         self.write_kwargs = write_kwargs if write_kwargs is not None else {}
 
@@ -63,9 +64,9 @@ def __init__(
         self._batch_size = None
 
         # Handle output directory cleanup logic
-        self.output_fs = get_fs(output_dir, self.write_kwargs.get("storage_options"))
-        self.output_dir = self.output_fs.sep.join([output_dir, self.name])
-        create_or_overwrite_dir(self.output_dir, self.output_fs)
+        self.output_fs = get_fs(output_path, self.write_kwargs.get("storage_options"))
+        self.output_path = self.output_fs.sep.join([output_path, self.name])
+        create_or_overwrite_dir(self.output_path, fs=self.output_fs)
 
     def setup(self, _worker_metadata: "WorkerMetadata | None" = None) -> None:
         if not hasattr(self, "_raft_handle"):
@@ -159,34 +160,34 @@ def process(self, task: FileGroupTask) -> FileGroupTask:
 
     def process_batch(self, tasks: list[FileGroupTask]) -> list[FileGroupTask]:
         """
-        Process an input file, compute minhashes, and write results to an output file.
-        Automatically adds a unique _curator_id field to each document if not present.
+        Process a batch of input files containing edges between documents.
+        Compute the weakly connected components of the graph and write a mapping of document ids to their connected component id.
 
         Parameters
         ----------
-        infiles: str, list[str]
-            Path to input file (JSONL format) or list of paths
-        outfile: str
-            Path to output file (Parquet format)
-        columns: list, optional
-            Columns to read from input file
+        tasks: list[FileGroupTask]
+            A list of FileGroupTasks containing the input files.
+        Returns
+        -------
+        list[FileGroupTask]
+            A list of FileGroupTasks containing the output doc_id to connected component id mapping.
         """
         input_files = []
         for task in tasks:
             input_files.extend(task.data)
-        output_file = self.output_fs.sep.join([self.output_dir, f"{tasks[0].task_id}.parquet"])
-        edgelist_columns = [self.source_column, self.destination_column]
+        output_file = self.output_fs.sep.join([self.output_path, f"{tasks[0]._uuid}.parquet"])
+        edgelist_columns = [self.source_field, self.destination_field]
         dfs = []
         for input_file in input_files:
             dfs.append(self.read_parquet(input_file, columns=edgelist_columns, **self.read_kwargs))
         df = cudf.concat(dfs)
         # remove duplicate edges
         df = df.drop_duplicates(subset=edgelist_columns, ignore_index=True)
-        vertices, labels = self.weakly_connected_components(df, self.source_column, self.destination_column)
+        vertices, labels = self.weakly_connected_components(df, self.source_field, self.destination_field)
         df = cudf.DataFrame(
             {
                 CURATOR_DEDUP_ID_STR: vertices,
-                "_duplicate_group_id": labels,
+                CURATOR_FUZZY_DUPLICATE_GROUP_FIELD: labels,
             }
         )
         self.write_parquet(df=df, filepath=output_file, index=False, **self.write_kwargs)
 
@@ -0,0 +1,144 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING, Any, Literal
+
+from ray_curator.stages.deduplication.fuzzy.utils import CURATOR_FUZZY_DUPLICATE_GROUP_FIELD
+from ray_curator.stages.deduplication.id_generator import CURATOR_DEDUP_ID_STR
+from ray_curator.stages.shuffler.rapidsmpf_shuffler import pylibcudf_to_cudf_dataframe
+from ray_curator.stages.shuffler.stage import ShuffleStage
+from ray_curator.tasks import FileGroupTask
+from ray_curator.utils.file_utils import get_fs
+
+if TYPE_CHECKING:
+    import cudf
+
+DUPLICATE_IDS_SUBDIR = "FuzzyDuplicateIds"
+
+
+class IdentifyDuplicatesStage(ShuffleStage):
+    """
+    Stage that generates removal IDs for fuzzy deduplication.
+
+    The data is shuffled on the duplicate group field (similar to grouping by that field),
+    followed by selecting one document per group.
+    Currently the removal strategy keeps the first document encountered for each group within a
+    shuffled partition and marks every other document of that group for removal.
+
+    Parameters
+    ----------
+    duplicate_group_field
+        Column name representing the group id for a document.
+    document_id_field
+        Column name representing the id of a document. The values of this column are written out
+        as the removal ids.
+    total_nparts
+        Total number of output partitions. If None, will be set automatically by the executor.
+    output_path
+        Base path for the output. Files are written to the ``FuzzyDuplicateIds`` subdirectory of this path.
+    read_kwargs
+        Keyword arguments for cudf.read_parquet method.
+    write_kwargs
+        Keyword arguments for cudf.to_parquet method.
+        The ``storage_options`` key (if present) is also used to resolve the output filesystem.
+    rmm_pool_size
+        Size of the RMM GPU memory pool in bytes.
+        If "auto", the memory pool is set to 90% of the free GPU memory.
+        If None, the memory pool is set to 50% of the free GPU memory that can expand if needed.
+    spill_memory_limit
+        Device memory limit in bytes for spilling to host.
+        If "auto", the limit is set to 80% of the RMM pool size.
+        If None spilling is disabled.
+    enable_statistics
+        Whether the underlying rapidsmpf shuffler should collect shuffle statistics.
+    """
+
+    _name = "IdentifyDuplicates"
+
+    def __init__(  # noqa: PLR0913
+        self,
+        duplicate_group_field: str = CURATOR_FUZZY_DUPLICATE_GROUP_FIELD,
+        document_id_field: str = CURATOR_DEDUP_ID_STR,
+        total_nparts: int | None = None,
+        output_path: str = "./",
+        read_kwargs: dict[str, Any] | None = None,
+        write_kwargs: dict[str, Any] | None = None,
+        rmm_pool_size: int | Literal["auto"] | None = "auto",
+        spill_memory_limit: int | Literal["auto"] | None = "auto",
+        enable_statistics: bool = False,
+    ):
+        self.duplicate_group_field = duplicate_group_field
+        self.document_id_field = document_id_field
+        # The filesystem addresses the *output* location, so it must be resolved with the
+        # write-side storage options (consistent with the other fuzzy deduplication stages).
+        self.output_fs = get_fs(
+            output_path, storage_options=write_kwargs.get("storage_options") if write_kwargs is not None else None
+        )
+        self.output_path = self.output_fs.sep.join([output_path, DUPLICATE_IDS_SUBDIR])
+        # Normalize to a dict so later use never has to deal with None.
+        self.write_kwargs = write_kwargs if write_kwargs is not None else {}
+
+        super().__init__(
+            shuffle_on=[duplicate_group_field],
+            total_nparts=total_nparts,
+            output_path=self.output_path,
+            read_kwargs=read_kwargs,
+            write_kwargs=write_kwargs,
+            rmm_pool_size=rmm_pool_size,
+            spill_memory_limit=spill_memory_limit,
+            enable_statistics=enable_statistics,
+        )
+
+    def _get_removal_ids(self, df: "cudf.DataFrame") -> "cudf.DataFrame":
+        """
+        Get the removal ids for the given dataframe.
+
+        Every row whose group id has already been seen earlier in ``df`` is treated as a duplicate.
+        The document ids of those rows are returned, sorted, as a single-column dataframe.
+        """
+        is_repeat_of_group = df[self.duplicate_group_field].duplicated(keep="first")
+        removal_ids = df[is_repeat_of_group][self.document_id_field]
+        removal_ids = removal_ids.sort_values(ignore_index=True)
+        return removal_ids.to_frame()
+
+    def process(self, task: FileGroupTask) -> FileGroupTask:
+        """Delegate to the parent ShuffleStage implementation unchanged."""
+        return super().process(task)
+
+    def ray_stage_spec(self) -> dict[str, Any]:
+        """Return the parent ShuffleStage's stage spec unchanged."""
+        return super().ray_stage_spec()
+
+    def read_and_insert(self, task: FileGroupTask) -> FileGroupTask:
+        """Run the parent's read-and-insert step and hand back the same input task."""
+        super().read_and_insert(task)
+        return task
+
+    def insert_finished(self) -> None:
+        """Delegate to the parent ShuffleStage implementation unchanged."""
+        super().insert_finished()
+
+    def extract_and_write(self) -> list[FileGroupTask]:
+        """
+        Extract the shuffled partitions, compute the removal ids for each and write them to parquet.
+
+        For every partition yielded by the shuffle actor, one file named ``part.<partition_id>.parquet``
+        is written under ``self.output_path``.
+
+        Returns
+        -------
+        list[FileGroupTask]
+            One task per partition pointing at the written file. The task metadata carries the
+            partition index, the number of distinct groups and the number of removal ids.
+        """
+        self._check_actor_obj()
+        # Work on a copy so user-provided kwargs are never mutated; tolerate write_kwargs being None.
+        base_write_kwargs = dict(self.write_kwargs) if self.write_kwargs is not None else {}
+        base_write_kwargs.setdefault("index", False)
+
+        result_tasks = []
+        for partition_id, partition in self._actor_obj.extract():
+            shuffled_partition_df = pylibcudf_to_cudf_dataframe(partition, column_names=self.output_columns)
+            num_groups = shuffled_partition_df[self.duplicate_group_field].nunique()
+            removal_ids = self._get_removal_ids(shuffled_partition_df)
+            num_removal_ids = len(removal_ids)
+
+            output_file = self.output_fs.sep.join([self.output_path, f"part.{partition_id}.parquet"])
+            # If user has not specified row_group_size_rows, set it to the lower of 10% of the number of
+            # removal ids or 1M (default) or a minimum of 1k (for small datasets).
+            # The default is derived per partition in a fresh dict, so the value computed for one
+            # partition does not leak into the following ones.
+            partition_write_kwargs = dict(base_write_kwargs)
+            partition_write_kwargs.setdefault(
+                "row_group_size_rows", max(1000, min(num_removal_ids // 10, 1000 * 1000))
+            )
+            removal_ids.to_parquet(output_file, **partition_write_kwargs)
+            result_tasks.append(
+                FileGroupTask(
+                    task_id=partition_id,
+                    dataset_name=self.dataset_name + f"{self.name}",
+                    data=[output_file],
+                    _metadata={
+                        "partition_index": partition_id,
+                        "num_groups": num_groups,
+                        "num_removal_ids": num_removal_ids,
+                    },
+                )
+            )
+        return result_tasks
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+from ray_curator.stages.deduplication.fuzzy.workflow import FuzzyDeduplicationWorkflow`
	`2`	`+`
	`3`	`+__all__ = ["FuzzyDeduplicationWorkflow"]`