Replace numba-cuda runtime dependency with cuda-core (#589)

pentschev · jameslamb · web-flow · commit 5f2aec6f628e · 2026-02-13T12:58:03.000-06:00
* Replace numba-cuda runtime dependency with cuda-core

* Fix numba-cuda selection of CUDA-versioned packages

---------

Co-authored-by: James Lamb <jaylamb20@gmail.com>
diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml
@@ -10,6 +10,7 @@ dependencies:
 - c-compiler
 - cloudpickle
 - cmake>=3.26.4,!=3.30.0
+- cuda-core>=0.3.2
 - cuda-cudart-dev
 - cuda-nvcc
 - cuda-version=12.9
diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml
@@ -10,6 +10,7 @@ dependencies:
 - c-compiler
 - cloudpickle
 - cmake>=3.26.4,!=3.30.0
+- cuda-core>=0.3.2
 - cuda-cudart-dev
 - cuda-nvcc
 - cuda-version=12.9
diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml
@@ -10,6 +10,7 @@ dependencies:
 - c-compiler
 - cloudpickle
 - cmake>=3.26.4,!=3.30.0
+- cuda-core>=0.3.2
 - cuda-cudart-dev
 - cuda-nvcc
 - cuda-version=13.1
diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml
@@ -10,6 +10,7 @@ dependencies:
 - c-compiler
 - cloudpickle
 - cmake>=3.26.4,!=3.30.0
+- cuda-core>=0.3.2
 - cuda-cudart-dev
 - cuda-nvcc
 - cuda-version=13.1
diff --git a/conda/recipes/ucxx/recipe.yaml b/conda/recipes/ucxx/recipe.yaml
@@ -290,8 +290,7 @@ outputs:
         - ${{ pin_subpackage("libucxx", exact=True) }}
         - cuda-cudart-dev
       run:
-        - numba >=0.60.0,<0.62.0
-        - numba-cuda >=0.22.1
+        - cuda-core >=0.3.2
         - numpy >=1.23,<3.0
         # 'nvidia-ml-py' provides the 'pynvml' module
         - nvidia-ml-py>=12
@@ -431,8 +430,7 @@ outputs:
         - setuptools>=77.0.0
         - wheel
       run:
-        - numba >=0.60.0,<0.62.0
-        - numba-cuda >=0.22.1
+        - cuda-core >=0.3.2
         - python
         - pyyaml >=6
         - rapids-dask-dependency ${{ rapids_version }}
diff --git a/dependencies.yaml b/dependencies.yaml
@@ -297,73 +297,74 @@ dependencies:
           - &numpy numpy>=1.23,<3.0
           # 'nvidia-ml-py' provides the 'pynvml' module
           - nvidia-ml-py>=12
+          - cuda-core>=0.3.2
+  run_python_distributed_ucxx:
+    common:
+      - output_types: [conda, requirements, pyproject]
+        packages:
+          - rapids-dask-dependency==26.4.*,>=0.0.0a0
+          - pyyaml>=6
+          - cuda-core>=0.3.2
+  test_cpp:
+    common:
+      - output_types: conda
+        packages:
+          - *cmake_ver
+  test_python_ucxx:
+    common:
+      - output_types: [conda, requirements, pyproject]
+        packages:
+          - cloudpickle
+          - pytest<9.0.0
+          - pytest-asyncio>=1.0.0
+          - pytest-rerunfailures!=16.0.0  # See https://github.com/pytest-dev/pytest-rerunfailures/issues/302
+          - rapids-dask-dependency==26.4.*,>=0.0.0a0
       - output_types: [conda]
         packages:
-          - &numba_cuda numba-cuda>=0.22.1
+          - &numba_cuda_test numba-cuda>=0.22.1
     specific:
       - output_types: [requirements, pyproject]
         matrices:
           - matrix:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - &numba_cuda_cu12 numba-cuda[cu12]>=0.22.1
+              - &numba_cuda_cu12_test numba-cuda[cu12]>=0.22.1
           - matrix:
               cuda: "13.*"
               cuda_suffixed: "true"
             packages:
-              - &numba_cuda_cu13 numba-cuda[cu13]>=0.22.1
+              - &numba_cuda_cu13_test numba-cuda[cu13]>=0.22.1
           # fallback to numba-cuda with no extra CUDA packages if 'cuda_suffixed' isn't true
           - matrix:
             packages:
-              - *numba_cuda
-  run_python_distributed_ucxx:
+              - *numba_cuda_test
+  test_python_distributed_ucxx:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - rapids-dask-dependency==26.4.*,>=0.0.0a0
-          - pyyaml>=6
+          - *numpy
+          - pytest<9.0.0
+          - pytest-rerunfailures!=16.0.0  # See https://github.com/pytest-dev/pytest-rerunfailures/issues/302
       - output_types: [conda]
         packages:
-          - *numba_cuda
+          - *numba_cuda_test
     specific:
       - output_types: [requirements, pyproject]
         matrices:
           - matrix:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - *numba_cuda_cu12
+              - *numba_cuda_cu12_test
           - matrix:
               cuda: "13.*"
               cuda_suffixed: "true"
             packages:
-              - *numba_cuda_cu13
-          # fallback to numba-cuda with no extra CUDA packages if 'cuda_suffixed' isn't true
+              - *numba_cuda_cu13_test
           - matrix:
             packages:
-              - *numba_cuda
-  test_cpp:
-    common:
-      - output_types: conda
-        packages:
-          - *cmake_ver
-  test_python_ucxx:
-    common:
-      - output_types: [conda, requirements, pyproject]
-        packages:
-          - cloudpickle
-          - pytest<9.0.0
-          - pytest-asyncio>=1.0.0
-          - pytest-rerunfailures!=16.0.0  # See https://github.com/pytest-dev/pytest-rerunfailures/issues/302
-          - rapids-dask-dependency==26.4.*,>=0.0.0a0
-  test_python_distributed_ucxx:
-    common:
-      - output_types: [conda, requirements, pyproject]
-        packages:
-          - *numpy
-          - pytest<9.0.0
-          - pytest-rerunfailures!=16.0.0  # See https://github.com/pytest-dev/pytest-rerunfailures/issues/302
+              - *numba_cuda_test
   depends_on_cupy:
     common:
       - output_types: conda
diff --git a/python/distributed-ucxx/distributed_ucxx/ucxx.py b/python/distributed-ucxx/distributed_ucxx/ucxx.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: BSD-3-Clause
 
 """
@@ -94,13 +94,12 @@ class CudaStream(Enum):
 
 
 def synchronize_stream(stream: CudaStream = CudaStream.Default):
-    import numba.cuda
+    from ucxx._cuda_context import synchronize_default_stream
 
     if stream == CudaStream.Default:
-        numba_stream = numba.cuda.default_stream()
+        synchronize_default_stream()
     else:
         raise ValueError("Unsupported stream")
-    numba_stream.synchronize()
 
 
 class gc_disabled:
@@ -246,11 +245,11 @@ def init_once():
         or ("cuda" in ucx_tls and "^cuda" not in ucx_tls)
     ):
         try:
-            import numba.cuda
-        except ImportError:
+            from ucxx._cuda_context import ensure_cuda_context
+        except ImportError as e:
             raise ImportError(
-                "CUDA support with UCX requires Numba for context management"
-            )
+                "CUDA support with UCX requires cuda-core for context management."
+            ) from e
 
         cuda_visible_device = get_device_index_and_uuid(
             os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0]
@@ -261,7 +260,7 @@ def init_once():
                 pre_existing_cuda_context.device_info, os.getpid()
             )
 
-        numba.cuda.current_context()
+        ensure_cuda_context(0)
 
         cuda_context_created = has_cuda_context()
         if (
@@ -291,7 +290,8 @@ def init_once():
 
     pool_size_str = get_rmm_config("pool-size")
 
-    # Find the function, `cuda_array()`, to use when allocating new CUDA arrays
+    # Find the function, `device_array()`, to use when allocating new CUDA arrays.
+    # RMM is required for CUDA array allocation at runtime (numba is only for tests).
     try:
         import rmm
 
@@ -304,22 +304,9 @@ def device_array(n):
                 pool_allocator=True, managed_memory=False, initial_pool_size=pool_size
             )
     except ImportError:
-        try:
-            import numba.cuda
-
-            def numba_device_array(n):
-                a = numba.cuda.device_array((n,), dtype="u1")
-                weakref.finalize(a, numba.cuda.current_context)
-                return a
-
-            device_array = numba_device_array
 
-        except ImportError:
-
-            def device_array(n):
-                raise RuntimeError(
-                    "In order to send/recv CUDA arrays, Numba or RMM is required"
-                )
+        def device_array(n):
+            raise RuntimeError("In order to send/recv CUDA arrays, RMM is required.")
 
         if pool_size_str is not None:
             logger.warning(
diff --git a/python/distributed-ucxx/pyproject.toml b/python/distributed-ucxx/pyproject.toml
@@ -20,7 +20,7 @@ license = "BSD-3-Clause"
 license-files = ["LICENSE"]
 requires-python = ">=3.11"
 dependencies = [
-    "numba-cuda>=0.22.1",
+    "cuda-core>=0.3.2",
     "pyyaml>=6",
     "rapids-dask-dependency==26.4.*,>=0.0.0a0",
     "ucxx==0.49.*,>=0.0.0a0",
@@ -46,6 +46,7 @@ docs = [
 test = [
     "cudf==26.4.*,>=0.0.0a0",
     "cupy-cuda13x>=13.6.0",
+    "numba-cuda>=0.22.1",
     "numpy>=1.23,<3.0",
     "pytest-rerunfailures!=16.0.0",
     "pytest<9.0.0",
diff --git a/python/ucxx/examples/basic.py b/python/ucxx/examples/basic.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: BSD-3-Clause
 
 import argparse
@@ -12,9 +12,9 @@
 
 
 def _create_cuda_context():
-    import numba.cuda
+    from ucxx._cuda_context import ensure_cuda_context
 
-    numba.cuda.current_context()
+    ensure_cuda_context(0)
 
 
 async def _progress_coroutine(worker):
diff --git a/python/ucxx/pyproject.toml b/python/ucxx/pyproject.toml
@@ -19,8 +19,8 @@ authors = [
 license = "BSD-3-Clause"
 requires-python = ">=3.11"
 dependencies = [
+    "cuda-core>=0.3.2",
     "libucxx==0.49.*,>=0.0.0a0",
-    "numba-cuda>=0.22.1",
     "numpy>=1.23,<3.0",
     "nvidia-ml-py>=12",
     "rmm==26.4.*,>=0.0.0a0",
@@ -44,6 +44,7 @@ test = [
     "cloudpickle",
     "cudf==26.4.*,>=0.0.0a0",
     "cupy-cuda13x>=13.6.0",
+    "numba-cuda>=0.22.1",
     "pytest-asyncio>=1.0.0",
     "pytest-rerunfailures!=16.0.0",
     "pytest<9.0.0",
diff --git a/python/ucxx/ucxx/_cuda_context.py b/python/ucxx/ucxx/_cuda_context.py
@@ -0,0 +1,54 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""CUDA context management using cuda.core.
+
+Provides helpers to ensure a CUDA context is created and to synchronize
+the default stream.
+"""
+
+
+def _get_device_class():
+    """Return cuda.core's Device, falling back to cuda.core.experimental."""
+    try:
+        from cuda.core import Device
+
+        return Device
+    except ImportError:
+        try:
+            from cuda.core.experimental import Device
+
+            return Device
+        except ImportError as e:
+            raise ImportError(
+                "CUDA context management requires cuda-core (cuda-core>=0.3.2)."
+            ) from e
+
+
+def ensure_cuda_context(device_id: int = 0) -> None:
+    """Ensure a CUDA context exists for the given device and set it as current.
+
+    Parameters
+    ----------
+    device_id : int, optional
+        The CUDA device index (default: 0).
+    """
+    Device = _get_device_class()
+    Device(device_id).set_current()
+
+
+def synchronize_default_stream(device_id: int = 0) -> None:
+    """Synchronize device ``device_id``, which covers its default stream.
+
+    Required when coordinating with UCX CUDA transfers (e.g. before send/recv
+    of CUDA buffers).
+
+    Parameters
+    ----------
+    device_id : int, optional
+        The CUDA device index to make current and synchronize (default: 0).
+    """
+    Device = _get_device_class()
+    device = Device(device_id)
+    device.set_current()
+    device.sync()
diff --git a/python/ucxx/ucxx/_lib_async/continuous_ucx_progress.py b/python/ucxx/ucxx/_lib_async/continuous_ucx_progress.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: BSD-3-Clause
 
 
@@ -70,9 +70,9 @@ def __eq__(self, other):
 
 
 def _create_context():
-    import numba.cuda
+    from ucxx._cuda_context import ensure_cuda_context
 
-    numba.cuda.current_context()
+    ensure_cuda_context(0)
 
 
 class ThreadMode(ProgressTask):
diff --git a/python/ucxx/ucxx/benchmarks/backends/ucxx_core.py b/python/ucxx/ucxx/benchmarks/backends/ucxx_core.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: BSD-3-Clause
 
 from argparse import Namespace
@@ -17,9 +17,9 @@
 
 
 def _create_cuda_context(device):
-    import numba.cuda
+    from ucxx._cuda_context import ensure_cuda_context
 
-    numba.cuda.current_context(0)
+    ensure_cuda_context(device)
 
 
 def _transfer_wireup(ep, server):
diff --git a/python/ucxx/ucxx/benchmarks/send_recv.py b/python/ucxx/ucxx/benchmarks/send_recv.py
diff --git a/python/ucxx/ucxx/benchmarks/utils.py b/python/ucxx/ucxx/benchmarks/utils.py