1 change: 1 addition & 0 deletions .github/workflows/test.yml
@@ -210,6 +210,7 @@ jobs:
-c rapidsai
-c nvidia
python=3.13
cccl-python
cudf
cupy
cuda-version=${{ matrix.cuda-version }}
3 changes: 2 additions & 1 deletion requirements-test-gpu.txt
@@ -1,8 +1,9 @@
# The CI installs cudf and cupy using conda
# The CI installs cudf, cupy, and cuda-cccl using conda
# If you are using this file manually uncomment the following lines and
# set the cuda version matching your system.
# cudf-cu12
# cupy-cuda12x
# cuda-cccl[cu12]
fsspec>=2022.11.0
numba>=0.60
numba-cuda
64 changes: 60 additions & 4 deletions src/awkward/_backends/cupy.py
@@ -4,7 +4,7 @@

from awkward._backends.backend import Backend, KernelKeyType
from awkward._backends.dispatch import register_backend
from awkward._kernels import CupyKernel, NumpyKernel
from awkward._kernels import CudaComputeKernel, CupyKernel, NumpyKernel
from awkward._nplikes.cupy import Cupy
from awkward._nplikes.numpy import Numpy
from awkward._nplikes.numpy_like import NumpyMetadata
@@ -27,13 +27,69 @@ def nplike(self) -> Cupy:
def __init__(self):
self._cupy = Cupy.instance()

def __getitem__(self, index: KernelKeyType) -> CupyKernel | NumpyKernel:
def __getitem__(
self, index: KernelKeyType
) -> CudaComputeKernel | CupyKernel | NumpyKernel:
from awkward._connect import cuda
from awkward._connect.cuda import _compute as cuda_compute

kernel_name = index[0] if index else ""

# Try CuPy kernels first (primary implementation)
cupy = cuda.import_cupy("Awkward Arrays with CUDA")
_cuda_kernels = cuda.initialize_cuda_kernels(cupy)
func = _cuda_kernels[index]

if func is not None:
# CuPy kernel exists, use it
return CupyKernel(func, index)
else:
raise AssertionError(f"CuPyKernel not found: {index!r}")

# CuPy kernel not found, try cuda.compute as fallback
Collaborator:
Perhaps we should prioritize CCCL kernels for GPU operations, with CuPy serving as a fallback when CCCL is unavailable?

if self._supports_cuda_compute(kernel_name):
if cuda_compute.is_available():
# Return CudaComputeKernel for supported operations
compute_impl = self._get_cuda_compute_impl(kernel_name)
if compute_impl is not None:
return CudaComputeKernel(compute_impl, index)
else:
# cuda.compute is needed but not available
raise NotImplementedError(
f"Operation '{kernel_name}' on CUDA backend requires cuda.compute library "
f"(no CuPy kernel available). "
f"Please install cuda.compute or use the CPU backend: "
f"ak.to_backend(array, 'cpu')"
)

# Neither CuPy kernel nor cuda.compute implementation found
raise AssertionError(
f"Operation '{kernel_name}' is not supported on CUDA backend. "
f"CuPy kernel not found: {index!r}"
)

def _supports_cuda_compute(self, kernel_name: str) -> bool:
"""
Check if the given kernel operation is supported by cuda.compute.

Currently supports:
- awkward_sort
- awkward_argsort (future)
"""
# For now, we only support sort operations
return kernel_name in ("awkward_sort",)

def _get_cuda_compute_impl(self, kernel_name: str):
"""
Get the cuda.compute implementation for a kernel operation.

Args:
kernel_name: Name of the kernel operation (e.g., "awkward_sort")

Returns:
Callable implementing the operation, or None if not supported
"""
from awkward._connect.cuda import _compute as cuda_compute

if kernel_name == "awkward_sort":
return cuda_compute.segmented_sort

return None
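
For context, a minimal sketch of the CCCL-first ordering suggested in the review comment above. The helper names (_supports_cuda_compute, _get_cuda_compute_impl, is_available, import_cupy, initialize_cuda_kernels) are taken from this diff; the free-function framing and its name are assumptions, and this is not the PR's implementation.

from awkward._connect import cuda
from awkward._connect.cuda import _compute as cuda_compute
from awkward._kernels import CudaComputeKernel, CupyKernel


def lookup_kernel_cccl_first(backend, index):
    """Hypothetical lookup: prefer a cuda.compute implementation, fall back to CuPy kernels."""
    kernel_name = index[0] if index else ""

    # 1. Use cuda.compute when the operation is supported and the library is installed.
    if backend._supports_cuda_compute(kernel_name) and cuda_compute.is_available():
        compute_impl = backend._get_cuda_compute_impl(kernel_name)
        if compute_impl is not None:
            return CudaComputeKernel(compute_impl, index)

    # 2. Otherwise fall back to the existing CuPy kernels.
    cupy = cuda.import_cupy("Awkward Arrays with CUDA")
    func = cuda.initialize_cuda_kernels(cupy)[index]
    if func is not None:
        return CupyKernel(func, index)

    raise AssertionError(
        f"Operation {kernel_name!r} is not supported on the CUDA backend: {index!r}"
    )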
65 changes: 65 additions & 0 deletions src/awkward/_connect/cuda/_compute.py
@@ -0,0 +1,65 @@
# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE

from __future__ import annotations

from awkward._nplikes.cupy import Cupy

# Cache for cuda.compute availability
_cuda_compute_available: bool | None = None


def is_available() -> bool:
global _cuda_compute_available

if _cuda_compute_available is not None:
return _cuda_compute_available

try:
import cuda.compute # noqa: F401

_cuda_compute_available = True
except ImportError:
_cuda_compute_available = False

return _cuda_compute_available


def segmented_sort(
toptr,
fromptr,
length,
offsets,
offsetslength,
parentslength,
ascending,
stable,
):
from cuda.compute import SortOrder, segmented_sort

cupy_nplike = Cupy.instance()
cp = cupy_nplike._module

# Ensure offsets are int64 as expected by segmented_sort
if offsets.dtype != cp.int64:
offsets = offsets.astype(cp.int64)

num_segments = offsetslength - 1
num_items = int(offsets[-1]) if len(offsets) > 0 else 0

start_offsets = offsets[:-1]
end_offsets = offsets[1:]

order = SortOrder.ASCENDING if ascending else SortOrder.DESCENDING

segmented_sort(
fromptr, # d_in_keys
toptr, # d_out_keys
None, # d_in_values (not sorting values, just keys)
None, # d_out_values
num_items, # num_items
num_segments, # num_segments
start_offsets, # start_offsets_in
end_offsets, # end_offsets_in
order, # order (ASCENDING or DESCENDING)
None, # stream (use default stream)
)
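
For reference, a hedged usage sketch of the wrapper above: it sorts a flat key buffer within the segments described by an offsets array. It assumes CuPy and the cuda.compute package are installed; the input values are purely illustrative.

import cupy as cp

from awkward._connect.cuda import _compute as cuda_compute

fromptr = cp.asarray([3.0, 1.0, 2.0, 9.0, 5.0])  # flattened contents of two lists
toptr = cp.empty_like(fromptr)                   # output buffer for the sorted keys
offsets = cp.asarray([0, 3, 5], dtype=cp.int64)  # segments [0, 3) and [3, 5)

cuda_compute.segmented_sort(
    toptr,
    fromptr,
    length=len(fromptr),
    offsets=offsets,
    offsetslength=len(offsets),
    parentslength=len(fromptr),
    ascending=True,
    stable=True,
)
# toptr is now [1.0, 2.0, 3.0, 5.0, 9.0]: each segment sorted independently.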
21 changes: 20 additions & 1 deletion src/awkward/_kernels.py
@@ -17,7 +17,8 @@
from awkward._nplikes.typetracer import try_touch_data
from awkward._typing import Protocol, TypeAlias

KernelKeyType: TypeAlias = tuple # Tuple[str, Unpack[Tuple[metadata.dtype, ...]]]
# Tuple[str, Unpack[Tuple[metadata.dtype, ...]]]
KernelKeyType: TypeAlias = tuple


numpy = Numpy.instance()
@@ -215,6 +216,24 @@ def __call__(self, *args) -> None:
self._impl(grid, blocks, args)


class CudaComputeKernel(BaseKernel):
"""
Kernel implementation using cuda.compute library.

On the CUDA backend, this kernel handles operations that have
``cuda.compute`` implementations; all other operations use the
``CupyKernel``.
"""

def __init__(self, impl: Callable[..., Any], key: KernelKeyType):
super().__init__(impl, key)
self._cupy = Cupy.instance()

def __call__(self, *args) -> None:
args = maybe_materialize(*args)
return self._impl(*args)


class TypeTracerKernelError(KernelError):
def __init__(self):
self.str = None
9 changes: 5 additions & 4 deletions studies/cccl/_segment_algorithms.py
@@ -237,7 +237,8 @@ def segmented_select(
num_segments = len(d_in_segments) - 1

cond = numba.cuda.jit(cond)
# Apply select to get the data and indices where condition is true

# Step 1: Apply select to get the data and indices where condition is true

def select_predicate(pair):
return cond(pair[0])
@@ -253,13 +254,13 @@ def select_predicate(pair):
d_indices_out = d_indices_out[:total_selected]
d_selected_indices = d_indices_out[:total_selected]

# Step 3: Use searchsorted to count selected items per segment
# Step 2: Use searchsorted to count selected items per segment
# Use side='left' to count elements strictly less than each offset boundary
positions = cp.searchsorted(
d_selected_indices, d_in_segments, side='left')
d_counts = (positions[1:] - positions[:-1]).astype(cp.uint64)

# Step 4: Use exclusive scan to compute output segment start offsets
# Step 3: Use exclusive scan to compute output segment start offsets
exclusive_scan(
d_counts,
d_out_segments[:-1],
@@ -269,7 +270,7 @@ def select_predicate(pair):
stream,
)

# Step 5: Set the final offset to the total count
# Set the final offset to the total count
d_out_segments[-1] = total_selected
return total_selected
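
For reference, a minimal NumPy sketch of the searchsorted counting trick used in Step 2 above (illustrative only; not part of the diff):

import numpy as np

selected_indices = np.array([1, 2, 5, 6, 9])  # sorted flat indices that passed the predicate
segment_offsets = np.array([0, 4, 7, 10])     # three segments: [0, 4), [4, 7), [7, 10)

# side="left" counts the selected indices strictly below each boundary,
# so adjacent differences give the per-segment counts.
positions = np.searchsorted(selected_indices, segment_offsets, side="left")
counts = positions[1:] - positions[:-1]       # -> [2, 2, 1]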

8 changes: 3 additions & 5 deletions tests-cuda/test_3459_virtualarray_with_cuda.py
@@ -494,7 +494,6 @@ def test_numpyarray_nanargmax(numpyarray, virtual_numpyarray):
assert virtual_numpyarray.is_all_materialized


@pytest.mark.xfail(reason="awkward_sort is not implemented")
Collaborator:
@shwina - we should probably keep the virtual arrays out of the POC. Our design goal is to support delayed (lazy) evaluation of operations on virtual arrays. This ensures that array transformations are represented symbolically until explicitly materialized.

@pfackeldey (Collaborator, Nov 27, 2025):
I agree with keeping virtual arrays out of this for now; I think adding this later is an easy step, since we can just explicitly "maybe_materialize" when calling CCCL kernels.

Just to clarify: virtual arrays are not for delaying kernels/operations. They only lazify the IO when using ak.from_buffers to create an ak.Array. Once the buffer has been delivered by the IO source, everything is eager (there is no symbolic representation of the program or any compute graph here).

It is not the goal of virtual arrays to create a symbolic representation of the program. That doesn't exist in awkward so far; it is what we've done over in https://github.com/dask-contrib/dask-awkward at the level of "highlevel Arrays". In principle it would be interesting to have this, but then it should likely be at the level of buffers and kernels that act on those.

@ikrommyd (Collaborator, Nov 27, 2025):
Looking at the CI errors for the virtual arrays, by the way, the tests seem to be hitting cupy/cupy#9089, which has been fixed but will only be released in CuPy 14. I see

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

I think if it weren't for this problem, they would probably pass too. So they can't be enabled until CuPy 14 is out at least.

def test_numpyarray_sort(numpyarray, virtual_numpyarray):
assert not virtual_numpyarray.is_any_materialized
assert ak.array_equal(
@@ -1224,7 +1223,6 @@ def test_listoffsetarray_nanargmax(numpy_like):
assert virtual_array.is_all_materialized


@pytest.mark.xfail(reason="awkward_sort is not implemented")
def test_listoffsetarray_sort(listoffsetarray, virtual_listoffsetarray):
assert not virtual_listoffsetarray.is_any_materialized
assert ak.array_equal(
@@ -2256,7 +2254,9 @@ def test_listarray_nanargmax(numpy_like):
assert virtual_array.is_all_materialized


@pytest.mark.xfail(reason="awkward_sort is not implemented")
@pytest.mark.xfail(
reason="ListArray.to_ListOffsetArray64 fails with virtual arrays on CUDA"
)
def test_listarray_sort(listarray, virtual_listarray):
assert not virtual_listarray.is_any_materialized
assert ak.array_equal(
@@ -3356,7 +3356,6 @@ def test_recordarray_argmax_y_field(recordarray, virtual_recordarray):
assert virtual_recordarray.is_any_materialized


@pytest.mark.xfail(reason="awkward_sort is not implemented")
def test_recordarray_sort_x_field(recordarray, virtual_recordarray):
# Test sort on the x field (ListOffsetArray)
assert not virtual_recordarray.is_any_materialized
@@ -3370,7 +3369,6 @@ def test_recordarray_sort_x_field(recordarray, virtual_recordarray):
assert virtual_recordarray.is_any_materialized


@pytest.mark.xfail(reason="awkward_sort is not implemented")
def test_recordarray_sort_y_field(recordarray, virtual_recordarray):
# Test sort on the y field (NumpyArray)
assert not virtual_recordarray.is_any_materialized