def __getitem__(
    self, index: KernelKeyType
) -> CudaComputeKernel | CupyKernel | NumpyKernel:
    """
    Return the kernel wrapper registered under ``index`` for the CUDA backend.

    Lookup order:

    1. The CuPy kernel table returned by ``cuda.initialize_cuda_kernels``.
       A non-``None`` entry is wrapped in ``CupyKernel`` and returned.
    2. Otherwise, if ``_supports_cuda_compute`` accepts the kernel name, the
       ``cuda.compute`` implementation wrapped in ``CudaComputeKernel``.

    Args:
        index: Kernel key; its first element is used as the kernel name
            (an empty key yields the empty name).

    Raises:
        NotImplementedError: The operation has a ``cuda.compute``
            implementation but that library cannot be imported.
        AssertionError: Neither a CuPy kernel nor a ``cuda.compute``
            implementation exists for ``index``.
    """
    from awkward._connect import cuda
    from awkward._connect.cuda import _compute as cuda_compute

    kernel_name = index[0] if index else ""

    # CuPy kernels are the primary implementation and win whenever present.
    cupy = cuda.import_cupy("Awkward Arrays with CUDA")
    _cuda_kernels = cuda.initialize_cuda_kernels(cupy)
    func = _cuda_kernels[index]
    if func is not None:
        return CupyKernel(func, index)

    # No CuPy kernel: fall back to cuda.compute for the operations it covers.
    if self._supports_cuda_compute(kernel_name):
        if not cuda_compute.is_available():
            # Name the installable packages: "cuda.compute" is the import
            # name, not something a user can pass to pip or conda.
            raise NotImplementedError(
                f"Operation '{kernel_name}' on CUDA backend requires cuda.compute library "
                "(no CuPy kernel available). "
                "Please install it (pip package 'cuda-cccl', conda package 'cccl-python') "
                "or use the CPU backend: "
                "ak.to_backend(array, 'cpu')"
            )

        compute_impl = self._get_cuda_compute_impl(kernel_name)
        if compute_impl is not None:
            return CudaComputeKernel(compute_impl, index)

    # Neither CuPy kernel nor cuda.compute implementation found
    raise AssertionError(
        f"Operation '{kernel_name}' is not supported on CUDA backend. "
        f"CuPy kernel not found: {index!r}"
    )


def _supports_cuda_compute(self, kernel_name: str) -> bool:
    """
    Return whether ``kernel_name`` has a ``cuda.compute`` implementation.

    Only ``"awkward_sort"`` is supported at present; every other name,
    including ``"awkward_argsort"``, returns ``False``.
    """
    return kernel_name in ("awkward_sort",)


def _get_cuda_compute_impl(self, kernel_name: str):
    """
    Return the ``cuda.compute``-backed callable for ``kernel_name``.

    Args:
        kernel_name: Name of the kernel operation (e.g., "awkward_sort")

    Returns:
        Callable implementing the operation, or None if not supported
    """
    from awkward._connect.cuda import _compute as cuda_compute

    implementations = {
        "awkward_sort": cuda_compute.segmented_sort,
    }
    return implementations.get(kernel_name)
# Cache for cuda.compute availability: None until the first probe, then the
# boolean outcome of that probe.
_cuda_compute_available: bool | None = None


def is_available() -> bool:
    """
    Return whether the ``cuda.compute`` package can be imported.

    The import is attempted at most once per process; the outcome is stored
    in the module-level ``_cuda_compute_available`` and reused afterwards.
    """
    global _cuda_compute_available

    if _cuda_compute_available is None:
        try:
            import cuda.compute  # noqa: F401
        except ImportError:
            _cuda_compute_available = False
        else:
            _cuda_compute_available = True

    return _cuda_compute_available


def segmented_sort(
    toptr,
    fromptr,
    length,
    offsets,
    offsetslength,
    parentslength,
    ascending,
    stable,
):
    """
    Sort each segment of ``fromptr`` independently, writing into ``toptr``.

    Segment ``i`` spans ``offsets[i]`` to ``offsets[i + 1]``. The work is
    delegated to ``cuda.compute.segmented_sort`` as a keys-only sort on the
    default stream.

    Args:
        toptr: Output buffer receiving the sorted keys.
        fromptr: Input buffer holding the keys to sort.
        length: Accepted for signature compatibility; not used here.
        offsets: Segment boundaries; converted to int64 when necessary.
        offsetslength: Number of entries in ``offsets``; the number of
            segments is ``offsetslength - 1``.
        parentslength: Accepted for signature compatibility; not used here.
        ascending: Sort ascending when true, descending otherwise.
        stable: Accepted for signature compatibility; not used here.
            NOTE(review): confirm whether the underlying sort is stable.

    Returns:
        None. ``toptr`` is filled in place.
    """
    num_segments = offsetslength - 1
    # NOTE(review): reading offsets[-1] pulls a scalar back from the array;
    # `length` presumably carries the same value - confirm before switching.
    num_items = int(offsets[-1]) if len(offsets) > 0 else 0

    # With no segments or no items there is nothing to sort. Returning here
    # also avoids handing a negative segment count to the library when the
    # offsets buffer is empty.
    if num_segments <= 0 or num_items == 0:
        return

    # Imported lazily so this module can be loaded without cuda.compute;
    # aliased so the import does not shadow this function's own name.
    from cuda.compute import SortOrder
    from cuda.compute import segmented_sort as _cccl_segmented_sort

    # Ensure offsets are int64 as expected by segmented_sort
    if offsets.dtype != "int64":
        offsets = offsets.astype("int64")

    start_offsets = offsets[:-1]
    end_offsets = offsets[1:]

    order = SortOrder.ASCENDING if ascending else SortOrder.DESCENDING

    _cccl_segmented_sort(
        fromptr,  # d_in_keys
        toptr,  # d_out_keys
        None,  # d_in_values (not sorting values, just keys)
        None,  # d_out_values
        num_items,  # num_items
        num_segments,  # num_segments
        start_offsets,  # start_offsets_in
        end_offsets,  # end_offsets_in
        order,  # order (ASCENDING or DESCENDING)
        None,  # stream (use default stream)
    )
class CudaComputeKernel(BaseKernel):
    """
    Kernel wrapper for implementations backed by the ``cuda.compute`` library.

    The CUDA backend hands out this wrapper for operations that have a
    ``cuda.compute`` implementation; other operations use ``CupyKernel``.
    Calling the wrapper passes the arguments through ``maybe_materialize``
    and forwards them, unpacked, to the wrapped callable.
    """

    def __init__(self, impl: Callable[..., Any], key: KernelKeyType):
        super().__init__(impl, key)
        self._cupy = Cupy.instance()

    def __call__(self, *args) -> None:
        materialized = maybe_materialize(*args)
        return self._impl(*materialized)
count elements strictly less than each offset boundary positions = cp.searchsorted( d_selected_indices, d_in_segments, side='left') d_counts = (positions[1:] - positions[:-1]).astype(cp.uint64) - # Step 4: Use exclusive scan to compute output segment start offsets + # Step 3: Use exclusive scan to compute output segment start offsets exclusive_scan( d_counts, d_out_segments[:-1], @@ -269,7 +270,7 @@ def select_predicate(pair): stream, ) - # Step 5: Set the final offset to the total count + # Set the final offset to the total count d_out_segments[-1] = total_selected return total_selected diff --git a/tests-cuda/test_3459_virtualarray_with_cuda.py b/tests-cuda/test_3459_virtualarray_with_cuda.py index e2bcab1275..bb15a2a04b 100644 --- a/tests-cuda/test_3459_virtualarray_with_cuda.py +++ b/tests-cuda/test_3459_virtualarray_with_cuda.py @@ -494,7 +494,6 @@ def test_numpyarray_nanargmax(numpyarray, virtual_numpyarray): assert virtual_numpyarray.is_all_materialized -@pytest.mark.xfail(reason="awkward_sort is not implemented") def test_numpyarray_sort(numpyarray, virtual_numpyarray): assert not virtual_numpyarray.is_any_materialized assert ak.array_equal( @@ -1224,7 +1223,6 @@ def test_listoffsetarray_nanargmax(numpy_like): assert virtual_array.is_all_materialized -@pytest.mark.xfail(reason="awkward_sort is not implemented") def test_listoffsetarray_sort(listoffsetarray, virtual_listoffsetarray): assert not virtual_listoffsetarray.is_any_materialized assert ak.array_equal( @@ -2256,7 +2254,9 @@ def test_listarray_nanargmax(numpy_like): assert virtual_array.is_all_materialized -@pytest.mark.xfail(reason="awkward_sort is not implemented") +@pytest.mark.xfail( + reason="ListArray.to_ListOffsetArray64 fails with virtual arrays on CUDA" +) def test_listarray_sort(listarray, virtual_listarray): assert not virtual_listarray.is_any_materialized assert ak.array_equal( @@ -3356,7 +3356,6 @@ def test_recordarray_argmax_y_field(recordarray, virtual_recordarray): assert 
import numpy as np
import pytest

import awkward as ak


def _sort_on_cuda(data, **sort_kwargs):
    """Sort ``data`` on the CUDA backend and return the result on the CPU backend."""
    gpu_sorted = ak.sort(ak.to_backend(data, "cuda"), **sort_kwargs)
    return ak.to_backend(gpu_sorted, "cpu")


def test_sort_cuda_basic():
    data = ak.Array([[7, 5, 7], [], [2], [8, 2]])

    assert ak.to_list(_sort_on_cuda(data)) == [[5, 7, 7], [], [2], [2, 8]]


def test_sort_cuda_descending():
    data = ak.Array([[3, 1, 2], [5, 4], [], [6]])

    result = _sort_on_cuda(data, ascending=False)

    assert ak.to_list(result) == [[3, 2, 1], [5, 4], [], [6]]


def test_sort_cuda_float():
    data = ak.Array([[3.5, 1.2, 2.8], [5.1, 4.9], [], [6.0]])
    expected = [[1.2, 2.8, 3.5], [4.9, 5.1], [], [6.0]]

    result_list = ak.to_list(_sort_on_cuda(data))

    # pytest.approx does not handle nested lists, so compare one sublist at a
    # time; a length mismatch inside a sublist makes the comparison fail.
    assert len(result_list) == len(expected)
    for res_sublist, exp_sublist in zip(result_list, expected):
        assert res_sublist == pytest.approx(exp_sublist, abs=1e-10)


def test_sort_cuda_large():
    # A local generator keeps the test reproducible without reseeding the
    # global NumPy random state shared with other tests.
    rng = np.random.default_rng(42)
    data_list = [
        rng.integers(0, 100, size=rng.integers(0, 50)).tolist() for _ in range(100)
    ]

    data = ak.Array(data_list)
    cpu_sorted = ak.sort(data)

    assert ak.to_list(_sort_on_cuda(data)) == ak.to_list(cpu_sorted)


def test_sort_cuda_nested():
    data = ak.Array([[[3, 1, 2], [5, 4]], [[9, 7, 8]], [[6]]])
    cpu_sorted = ak.sort(data, axis=-1)

    result = _sort_on_cuda(data, axis=-1)

    assert ak.to_list(result) == ak.to_list(cpu_sorted)
    assert ak.to_list(result) == [[[1, 2, 3], [4, 5]], [[7, 8, 9]], [[6]]]


def test_sort_cuda_deeply_nested():
    data = ak.Array([[[[5, 2, 8], [1, 3]], [[4, 6]]], [[[9, 7]]]])
    cpu_sorted = ak.sort(data, axis=-1)

    result = _sort_on_cuda(data, axis=-1)

    assert ak.to_list(result) == ak.to_list(cpu_sorted)
    assert ak.to_list(result) == [[[[2, 5, 8], [1, 3]], [[4, 6]]], [[[7, 9]]]]


def test_sort_cuda_unsupported_axis():
    """Test that sorting at unsupported axes fails with clear error."""
    # Sorting at axis=-2 requires CuPy kernels that don't exist
    # This should fail with an AssertionError indicating missing kernels
    data = ak.Array([[[7, 2, 3], [4, 5, 6]]])
    gpu_data = ak.to_backend(data, "cuda")

    # axis=-1 should work (our cuda.compute implementation)
    sorted_axis_minus1 = ak.sort(gpu_data, axis=-1)
    result = ak.to_backend(sorted_axis_minus1, "cpu")
    assert ak.to_list(result) == [[[2, 3, 7], [4, 5, 6]]]

    # axis=-2 should fail (requires CuPy kernels not available)
    with pytest.raises(
        AssertionError,
        match=r"(CuPyKernel not found|Operation .* is not supported)",
    ):
        ak.sort(gpu_data, axis=-2)


def test_sort_cuda_no_compute(monkeypatch):
    """Test that helpful error is raised when cuda.compute is not available."""
    from awkward._connect.cuda import _compute as cuda_compute

    # Pretend the availability probe already ran and failed; monkeypatch
    # restores the cached value when the test ends, even on failure.
    monkeypatch.setattr(cuda_compute, "_cuda_compute_available", False)

    data = ak.Array([[7, 5, 7], [], [2], [8, 2]])
    gpu_data = ak.to_backend(data, "cuda")

    with pytest.raises(NotImplementedError, match=r"cuda\.compute"):
        ak.sort(gpu_data)