Commit cebf111

Bump rapids to 25.10 (#1204)
* Bump rapids to 25.10 and remove cugraph in favor of pylibcugraph
* Update shuffler based on rapidsmpf api changes
* Remove cugraph comms init in favor of pylibcugraph comms
* Update KMeans API to the latest version
* Update KMeans to pass in random state and use kmeans|| for single GPU consistency check
* Upgrade packages in lockfile
* Update lockfile
* Exclude rapidsmpf shuffler class from testing since it's tested indirectly via shuffle stage

Signed-off-by: Ayush Dattagupta <[email protected]>
1 parent 1ae47d5 commit cebf111

File tree

6 files changed: +2486 additions, -3219 deletions


nemo_curator/stages/deduplication/fuzzy/connected_components.py

Lines changed: 1 addition & 1 deletion
@@ -16,10 +16,10 @@
 from typing import TYPE_CHECKING, Any
 
 import cudf
-from cugraph.dask.comms.comms_wrapper import init_subcomms as c_init_subcomms
 from loguru import logger
 from pylibcugraph import GraphProperties, MGGraph, ResourceHandle
 from pylibcugraph import weakly_connected_components as pylibcugraph_wcc
+from pylibcugraph.comms.comms_wrapper import init_subcomms as c_init_subcomms
 
 from nemo_curator.backends.experimental.utils import RayStageSpecKeys
 from nemo_curator.stages.base import ProcessingStage
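
The import move above replaces the removed cugraph.dask comms wrapper with the pylibcugraph equivalent. A minimal sketch of how the sub-communicator setup might be invoked, assuming the pylibcugraph wrapper keeps the same (handle, row_comm_size) signature as the old cugraph one; `init_wcc_subcomms`, `raft_handle`, and `row_comm_size` are illustrative names, not code from this commit:

```python
from pylibcugraph.comms.comms_wrapper import init_subcomms as c_init_subcomms


def init_wcc_subcomms(raft_handle, row_comm_size: int) -> None:
    """Split the global RAFT communicator into the 2D sub-communicators that
    multi-GPU weakly_connected_components expects."""
    c_init_subcomms(raft_handle, row_comm_size)
```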

nemo_curator/stages/deduplication/semantic/kmeans.py

Lines changed: 4 additions & 4 deletions
@@ -189,8 +189,8 @@ def process_batch(self, tasks: list[FileGroupTask]) -> list[_EmptyTask]:
         logger.debug(f"Read time: {(t1 - t0):.2f} seconds")
         # Fit the model cooperatively across actors, then predict on local data
         concatenated_embeddings = cp.concatenate(embeddings_arrays, axis=0)
-        self.kmeans.fit(concatenated_embeddings, sample_weight=None)
-        labels = self.kmeans.predict(concatenated_embeddings).astype(cp.int32)
+        self.kmeans._fit(concatenated_embeddings, sample_weight=None, convert_dtype=False, multigpu=True)
+        labels = self.kmeans.predict(concatenated_embeddings, convert_dtype=False).astype(cp.int32)
 
         t2 = time.perf_counter()
         logger.info(f"KMeans fit+predict time: {(t2 - t1):.2f} seconds")
@@ -233,7 +233,7 @@ def process_batch(self, tasks: list[FileGroupTask]) -> list[_EmptyTask]:
         return results
 
     def setup(self, _: WorkerMetadata | None = None) -> None:
-        from cuml.cluster.kmeans_mg import KMeansMG as cumlKMeans
+        from cuml.cluster.kmeans import KMeans as cumlKMeans
 
         if not hasattr(self, "_raft_handle"):
             msg = "RAFT handle not found. Make sure the stage is initialized with RAFT"
@@ -246,11 +246,11 @@ def setup(self, _: WorkerMetadata | None = None) -> None:
             n_clusters=self.n_clusters,
             max_iter=self.max_iter,
             tol=self.tol,
+            random_state=self.random_state,
             verbose=self.verbose,
             n_init=self.n_init,
             oversampling_factor=self.oversampling_factor,
             max_samples_per_batch=self.max_samples_per_batch,
-            convert_dtype=False,
         )
 
     @staticmethod
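
Taken together, the hunks above move the stage from the removed `cuml.cluster.kmeans_mg.KMeansMG` to the public `KMeans` class while fitting through its private multi-GPU entry point. A hedged sketch of the resulting fit/predict path; `fit_and_label`, the RAFT `handle` argument, and the literal hyperparameter values are illustrative, and only the `_fit(..., multigpu=True)` and `predict(..., convert_dtype=False)` calls come from the diff:

```python
import cupy as cp
from cuml.cluster.kmeans import KMeans as cumlKMeans


def fit_and_label(raft_handle, embeddings: cp.ndarray, n_clusters: int, random_state: int) -> cp.ndarray:
    """Fit KMeans cooperatively across actors, then label the local shard."""
    kmeans = cumlKMeans(
        handle=raft_handle,         # RAFT handle created during stage setup
        n_clusters=n_clusters,
        max_iter=300,
        tol=1e-4,
        random_state=random_state,  # passed through for single-GPU consistency checks
        output_type="cupy",
    )
    # Private multi-GPU entry point this commit switches to (replaces KMeansMG.fit).
    kmeans._fit(embeddings, sample_weight=None, convert_dtype=False, multigpu=True)
    return kmeans.predict(embeddings, convert_dtype=False).astype(cp.int32)
```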

nemo_curator/stages/deduplication/shuffle_utils/rapidsmpf_shuffler.py

Lines changed: 21 additions & 10 deletions
@@ -20,7 +20,12 @@
 import rmm.mr
 from rapidsmpf.buffer.buffer import MemoryType
 from rapidsmpf.buffer.resource import BufferResource, LimitAvailableMemory
-from rapidsmpf.shuffler import partition_and_pack, unpack_and_concat
+from rapidsmpf.integrations.cudf.partition import (
+    partition_and_pack,
+    unpack_and_concat,
+    unspill_partitions,
+)
+from rapidsmpf.rmm_resource_adaptor import RmmResourceAdaptor
 from rapidsmpf.statistics import Statistics
 from rapidsmpf.utils.cudf import cudf_to_pylibcudf_table, pylibcudf_to_cudf_dataframe
 from rapidsmpf.utils.ray_utils import BaseShufflingActor
@@ -34,7 +39,8 @@
     from rapidsmpf.shuffler import Shuffler
 
 
-class BulkRapidsMPFShuffler(BaseShufflingActor):
+# Exempt this class from coverage as it's indirectly tested by the ShuffleStage, which coverage tools don't pick up.
+class BulkRapidsMPFShuffler(BaseShufflingActor):  # pragma: no cover
     """
     Class that performs a bulk shuffle operation.
     This class is compatible with Ray Actors communicating with each other using UCXX communication.
@@ -120,7 +126,7 @@ def setup_worker(self, root_address_bytes: bytes) -> None:
         super().setup_worker(root_address_bytes)
 
         # Initialize the RMM memory resource
-        mr = rmm.mr.StatisticsResourceAdaptor(
+        mr = RmmResourceAdaptor(
             rmm.mr.PoolMemoryResource(
                 rmm.mr.CudaMemoryResource(),
                 initial_pool_size=self.rmm_pool_size,
@@ -134,14 +140,14 @@ def setup_worker(self, root_address_bytes: bytes) -> None:
             if self.spill_memory_limit is None
             else {MemoryType.DEVICE: LimitAvailableMemory(mr, limit=self.spill_memory_limit)}
         )
-        br = BufferResource(mr, memory_available)
+        self.br = BufferResource(device_mr=mr, memory_available=memory_available)
         # Create a statistics object
-        self.stats = Statistics(self.enable_statistics)
+        self.stats = Statistics(enable=self.enable_statistics, mr=mr)
         # Create a shuffler
         self.shuffler: Shuffler = self.create_shuffler(
             0,
             total_num_partitions=self.total_nparts,
-            buffer_resource=br,
+            buffer_resource=self.br,
             statistics=self.stats,
         )
 
@@ -216,11 +222,11 @@ def insert_chunk(self, table: plc.Table | cudf.DataFrame, column_names: list[str
             table = cudf_to_pylibcudf_table(table)
         columns_to_hash = tuple(column_names.index(val) for val in self.shuffle_on)
         packed_inputs = partition_and_pack(
-            table,
+            table=table,
+            br=self.br,
            columns_to_hash=columns_to_hash,
             num_partitions=self.total_nparts,
             stream=DEFAULT_STREAM,
-            device_mr=rmm.mr.get_current_device_resource(),
         )
         self.shuffler.insert_chunks(packed_inputs)
 
@@ -269,9 +275,14 @@ def extract(self) -> Iterator[tuple[int, plc.Table]]:
             partition_id = self.shuffler.wait_any()
             packed_chunks = self.shuffler.extract(partition_id)
             partition = unpack_and_concat(
-                packed_chunks,
+                unspill_partitions(
+                    packed_chunks,
+                    br=self.br,
+                    allow_overbooking=True,
+                    statistics=self.stats,
+                ),
+                br=self.br,
                 stream=DEFAULT_STREAM,
-                device_mr=rmm.mr.get_current_device_resource(),
             )
             yield partition_id, partition
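
The rapidsmpf 25.10 API moves `partition_and_pack`/`unpack_and_concat` under `rapidsmpf.integrations.cudf.partition`, routes allocations through a `BufferResource`, and requires spilled chunks to be unspilled before concatenation. A hedged sketch of that round trip as free functions rather than actor methods; the helper names and the `DEFAULT_STREAM` import path are assumptions for illustration, while the keyword arguments mirror the diff:

```python
from rapidsmpf.integrations.cudf.partition import (
    partition_and_pack,
    unpack_and_concat,
    unspill_partitions,
)
from rmm.pylibrmm.stream import DEFAULT_STREAM  # assumed import path for the default stream


def insert_chunk(shuffler, table, column_names, shuffle_on, total_nparts, br):
    """Hash-partition a pylibcudf table and hand the packed chunks to the shuffler."""
    columns_to_hash = tuple(column_names.index(col) for col in shuffle_on)
    packed_inputs = partition_and_pack(
        table=table,
        br=br,                       # the BufferResource now owns device allocations
        columns_to_hash=columns_to_hash,
        num_partitions=total_nparts,
        stream=DEFAULT_STREAM,
    )
    shuffler.insert_chunks(packed_inputs)


def extract_partition(shuffler, partition_id, br, stats):
    """Unspill any host-resident chunks, then concatenate them into a single table."""
    packed_chunks = shuffler.extract(partition_id)
    return unpack_and_concat(
        unspill_partitions(packed_chunks, br=br, allow_overbooking=True, statistics=stats),
        br=br,
        stream=DEFAULT_STREAM,
    )
```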

pyproject.toml

Lines changed: 6 additions & 7 deletions
@@ -71,13 +71,12 @@ cuda12 = ["gpustat", "nvidia-ml-py"]
 
 # Installs CPU + GPU text curation modules
 deduplication_cuda12 = [
-    "cudf-cu12==25.6.*",
-    "cugraph-cu12==25.6.*",
-    "cuml-cu12==25.6.*",
-    "nx-cugraph-cu12==25.6.*",
-    "pylibraft-cu12==25.6.*",
-    "raft-dask-cu12==25.6.*",
-    "rapidsmpf-cu12==25.6.*",
+    "cudf-cu12==25.10.*",
+    "cuml-cu12==25.10.*",
+    "pylibcugraph-cu12==25.10.*",
+    "pylibraft-cu12==25.10.*",
+    "raft-dask-cu12==25.10.*",
+    "rapidsmpf-cu12==25.10.*",
 ]
 
 audio_cpu = [

tests/stages/deduplication/semantic/test_kmeans.py

Lines changed: 4 additions & 4 deletions
@@ -114,7 +114,7 @@ def run_single_gpu_baseline(
 ) -> np.ndarray:
     single_gpu_kmeans = cuml.KMeans(
         n_clusters=n_clusters,
-        init="k-means++",
+        init="k-means||",
         max_iter=300,
         tol=1e-4,
         random_state=RANDOM_STATE,
@@ -377,7 +377,7 @@ def test_process_batch_multiple_groups(self, tmp_path: Path):  # noqa: PLR0915
 
         # Only mock the essential parts that can't run without RAFT setup
         mock_kmeans = Mock()
-        mock_kmeans.fit = Mock()
+        mock_kmeans._fit = Mock()
         mock_kmeans.predict = Mock(return_value=cp.zeros(40, dtype=cp.int32))
         mock_kmeans.cluster_centers_ = cp.random.random((2, 32), dtype=cp.float32)
         stage.kmeans = mock_kmeans
@@ -439,11 +439,11 @@ def spy_write(*args, **kwargs) -> None:
         assert call_kwargs["assign_id"] is False
 
         # Verify KMeans operations
-        mock_kmeans.fit.assert_called_once()
+        mock_kmeans._fit.assert_called_once()
         mock_kmeans.predict.assert_called_once()
 
         # Check the concatenated embeddings shape
-        fit_call_args = mock_kmeans.fit.call_args[0]
+        fit_call_args = mock_kmeans._fit.call_args[0]
         embeddings_passed_to_fit = fit_call_args[0]
         assert embeddings_passed_to_fit.shape == (40, 32), "Should concatenate embeddings from all groups"
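
For context, the baseline now initializes with "k-means||" so that, given the shared random state, the single-GPU reference run is comparable to the multi-GPU path. A sketch of the updated baseline, simplified to its essentials, with `RANDOM_STATE` stubbed as a placeholder for the test module's constant:

```python
import cuml
import cupy as cp

RANDOM_STATE = 42  # placeholder for the test module's constant


def run_single_gpu_baseline(embeddings: cp.ndarray, n_clusters: int) -> cp.ndarray:
    """Single-GPU reference clustering used to sanity-check the multi-GPU labels."""
    single_gpu_kmeans = cuml.KMeans(
        n_clusters=n_clusters,
        init="k-means||",        # matches the scalable initialization on the multi-GPU path
        max_iter=300,
        tol=1e-4,
        random_state=RANDOM_STATE,
    )
    return single_gpu_kmeans.fit_predict(embeddings).astype(cp.int32)
```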
