1 change: 1 addition & 0 deletions .github/workflows/test.yml
@@ -210,6 +210,7 @@ jobs:
-c rapidsai
-c nvidia
python=3.13
cccl-python
cudf
cupy
cuda-version=${{ matrix.cuda-version }}
3 changes: 2 additions & 1 deletion requirements-test-gpu.txt
@@ -1,8 +1,9 @@
# The CI installs cudf and cupy using conda
# The CI installs cudf, cupy, and cuda-cccl using conda
# If you are using this file manually uncomment the following lines and
# set the cuda version matching your system.
# cudf-cu12
# cupy-cuda12x
# cuda-cccl[cu12]
fsspec>=2022.11.0
numba>=0.60
numba-cuda
64 changes: 60 additions & 4 deletions src/awkward/_backends/cupy.py
@@ -4,7 +4,7 @@

from awkward._backends.backend import Backend, KernelKeyType
from awkward._backends.dispatch import register_backend
from awkward._kernels import CupyKernel, NumpyKernel
from awkward._kernels import CudaComputeKernel, CupyKernel, NumpyKernel
from awkward._nplikes.cupy import Cupy
from awkward._nplikes.numpy import Numpy
from awkward._nplikes.numpy_like import NumpyMetadata
@@ -27,13 +27,69 @@ def nplike(self) -> Cupy:
def __init__(self):
self._cupy = Cupy.instance()

def __getitem__(self, index: KernelKeyType) -> CupyKernel | NumpyKernel:
def __getitem__(
self, index: KernelKeyType
) -> CudaComputeKernel | CupyKernel | NumpyKernel:
from awkward._connect import cuda
from awkward._connect.cuda import _compute as cuda_compute

kernel_name = index[0] if index else ""

# Try CuPy kernels first (primary implementation)
cupy = cuda.import_cupy("Awkward Arrays with CUDA")
_cuda_kernels = cuda.initialize_cuda_kernels(cupy)
func = _cuda_kernels[index]

if func is not None:
# CuPy kernel exists, use it
return CupyKernel(func, index)
else:
raise AssertionError(f"CuPyKernel not found: {index!r}")

# CuPy kernel not found, try cuda.compute as fallback
Collaborator:
Perhaps we should prioritize CCCL kernels for GPU operations, with CuPy serving as a fallback when CCCL is unavailable?

if self._supports_cuda_compute(kernel_name):
if cuda_compute.is_available():
# Return CudaComputeKernel for supported operations
compute_impl = self._get_cuda_compute_impl(kernel_name)
if compute_impl is not None:
return CudaComputeKernel(compute_impl, index)
else:
# cuda.compute is needed but not available
raise NotImplementedError(
f"Operation '{kernel_name}' on CUDA backend requires cuda.compute library "
f"(no CuPy kernel available). "
f"Please install cuda.compute or use the CPU backend: "
f"ak.to_backend(array, 'cpu')"
)

# Neither CuPy kernel nor cuda.compute implementation found
raise AssertionError(
f"Operation '{kernel_name}' is not supported on CUDA backend. "
f"CuPy kernel not found: {index!r}"
)

def _supports_cuda_compute(self, kernel_name: str) -> bool:
"""
Check if the given kernel operation is supported by cuda.compute.

Currently supports:
- awkward_sort
- awkward_argsort (future)
"""
# For now, we only support sort operations
return kernel_name in ("awkward_sort",)

def _get_cuda_compute_impl(self, kernel_name: str):
"""
Get the cuda.compute implementation for a kernel operation.

Args:
kernel_name: Name of the kernel operation (e.g., "awkward_sort")

Returns:
Callable implementing the operation, or None if not supported
"""
from awkward._connect.cuda import _compute as cuda_compute

if kernel_name == "awkward_sort":
return cuda_compute.segmented_sort

return None
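
For context, a minimal sketch of the CCCL-first ordering suggested in the review comment above. The helper names (_supports_cuda_compute, _get_cuda_compute_impl, is_available, import_cupy, initialize_cuda_kernels) are taken from this diff; the free-function framing and its name are assumptions, and this is not the PR's implementation.

from awkward._connect import cuda
from awkward._connect.cuda import _compute as cuda_compute
from awkward._kernels import CudaComputeKernel, CupyKernel


def lookup_kernel_cccl_first(backend, index):
    """Hypothetical lookup: prefer a cuda.compute implementation, fall back to CuPy kernels."""
    kernel_name = index[0] if index else ""

    # 1. Use cuda.compute when the operation is supported and the library is installed.
    if backend._supports_cuda_compute(kernel_name) and cuda_compute.is_available():
        compute_impl = backend._get_cuda_compute_impl(kernel_name)
        if compute_impl is not None:
            return CudaComputeKernel(compute_impl, index)

    # 2. Otherwise fall back to the existing CuPy kernels.
    cupy = cuda.import_cupy("Awkward Arrays with CUDA")
    func = cuda.initialize_cuda_kernels(cupy)[index]
    if func is not None:
        return CupyKernel(func, index)

    raise AssertionError(
        f"Operation {kernel_name!r} is not supported on the CUDA backend: {index!r}"
    )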
65 changes: 65 additions & 0 deletions src/awkward/_connect/cuda/_compute.py
@@ -0,0 +1,65 @@
# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE

from __future__ import annotations

from awkward._nplikes.cupy import Cupy

# Cache for cuda.compute availability
_cuda_compute_available: bool | None = None


def is_available() -> bool:
global _cuda_compute_available

if _cuda_compute_available is not None:
return _cuda_compute_available

try:
import cuda.compute # noqa: F401

_cuda_compute_available = True
except ImportError:
_cuda_compute_available = False

return _cuda_compute_available


def segmented_sort(
toptr,
fromptr,
length,
offsets,
offsetslength,
parentslength,
ascending,
stable,
):
from cuda.compute import SortOrder, segmented_sort

cupy_nplike = Cupy.instance()
cp = cupy_nplike._module

# Ensure offsets are int64 as expected by segmented_sort
if offsets.dtype != cp.int64:
offsets = offsets.astype(cp.int64)

num_segments = offsetslength - 1
num_items = int(offsets[-1]) if len(offsets) > 0 else 0

start_offsets = offsets[:-1]
end_offsets = offsets[1:]

order = SortOrder.ASCENDING if ascending else SortOrder.DESCENDING

segmented_sort(
fromptr, # d_in_keys
toptr, # d_out_keys
None, # d_in_values (not sorting values, just keys)
None, # d_out_values
num_items, # num_items
num_segments, # num_segments
start_offsets, # start_offsets_in
end_offsets, # end_offsets_in
order, # order (ASCENDING or DESCENDING)
None, # stream (use default stream)
)
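
For reference, a hedged usage sketch of the wrapper above: it sorts a flat key buffer within the segments described by an offsets array. It assumes CuPy and the cuda.compute package are installed; the input values are purely illustrative.

import cupy as cp

from awkward._connect.cuda import _compute as cuda_compute

fromptr = cp.asarray([3.0, 1.0, 2.0, 9.0, 5.0])  # flattened contents of two lists
toptr = cp.empty_like(fromptr)                   # output buffer for the sorted keys
offsets = cp.asarray([0, 3, 5], dtype=cp.int64)  # segments [0, 3) and [3, 5)

cuda_compute.segmented_sort(
    toptr,
    fromptr,
    length=len(fromptr),
    offsets=offsets,
    offsetslength=len(offsets),
    parentslength=len(fromptr),
    ascending=True,
    stable=True,
)
# toptr is now [1.0, 2.0, 3.0, 5.0, 9.0]: each segment sorted independently.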
21 changes: 20 additions & 1 deletion src/awkward/_kernels.py
@@ -17,7 +17,8 @@
from awkward._nplikes.typetracer import try_touch_data
from awkward._typing import Protocol, TypeAlias

KernelKeyType: TypeAlias = tuple # Tuple[str, Unpack[Tuple[metadata.dtype, ...]]]
# Tuple[str, Unpack[Tuple[metadata.dtype, ...]]]
KernelKeyType: TypeAlias = tuple


numpy = Numpy.instance()
@@ -215,6 +216,24 @@ def __call__(self, *args) -> None:
self._impl(grid, blocks, args)


class CudaComputeKernel(BaseKernel):
"""
Kernel implementation using cuda.compute library.

On the CUDA backend, this kernel handles operations that have
``cuda.compute`` implementations; all other operations use the
``CupyKernel``.
"""

def __init__(self, impl: Callable[..., Any], key: KernelKeyType):
super().__init__(impl, key)
self._cupy = Cupy.instance()

def __call__(self, *args) -> None:
args = maybe_materialize(*args)
return self._impl(*args)


class TypeTracerKernelError(KernelError):
def __init__(self):
self.str = None
9 changes: 5 additions & 4 deletions studies/cccl/_segment_algorithms.py
@@ -237,7 +237,8 @@ def segmented_select(
num_segments = len(d_in_segments) - 1

cond = numba.cuda.jit(cond)
# Apply select to get the data and indices where condition is true

# Step 1: Apply select to get the data and indices where condition is true

def select_predicate(pair):
return cond(pair[0])
@@ -253,13 +254,13 @@ def select_predicate(pair):
d_indices_out = d_indices_out[:total_selected]
d_selected_indices = d_indices_out[:total_selected]

# Step 3: Use searchsorted to count selected items per segment
# Step 2: Use searchsorted to count selected items per segment
# Use side='left' to count elements strictly less than each offset boundary
positions = cp.searchsorted(
d_selected_indices, d_in_segments, side='left')
d_counts = (positions[1:] - positions[:-1]).astype(cp.uint64)

# Step 4: Use exclusive scan to compute output segment start offsets
# Step 3: Use exclusive scan to compute output segment start offsets
exclusive_scan(
d_counts,
d_out_segments[:-1],
@@ -269,7 +270,7 @@ def select_predicate(pair):
stream,
)

# Step 5: Set the final offset to the total count
# Set the final offset to the total count
d_out_segments[-1] = total_selected
return total_selected
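
For reference, a minimal NumPy sketch of the searchsorted counting trick used in Step 2 above (illustrative only; not part of the diff):

import numpy as np

selected_indices = np.array([1, 2, 5, 6, 9])  # sorted flat indices that passed the predicate
segment_offsets = np.array([0, 4, 7, 10])     # three segments: [0, 4), [4, 7), [7, 10)

# side="left" counts the selected indices strictly below each boundary,
# so adjacent differences give the per-segment counts.
positions = np.searchsorted(selected_indices, segment_offsets, side="left")
counts = positions[1:] - positions[:-1]       # -> [2, 2, 1]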

8 changes: 3 additions & 5 deletions tests-cuda/test_3459_virtualarray_with_cuda.py
@@ -494,7 +494,6 @@ def test_numpyarray_nanargmax(numpyarray, virtual_numpyarray):
assert virtual_numpyarray.is_all_materialized


@pytest.mark.xfail(reason="awkward_sort is not implemented")
Collaborator:
@shwina - we should probably keep the virtual arrays out of the POC. Our design goal is to support delayed (lazy) evaluation of operations on virtual arrays. This ensures that array transformations are represented symbolically until explicitly materialized.

@pfackeldey (Collaborator, Nov 27, 2025):
I agree with keeping virtual arrays out of this for now; I think adding this later is an easy step, since we can just explicitly "maybe_materialize" when calling CCCL kernels.

Just to clarify: virtual arrays are not for delaying kernels/operations. They only lazify the IO when using ak.from_buffers to create an ak.Array. Once the buffer has been delivered by the IO source, everything is eager (there is no symbolic representation of the program or any compute graph here).

It is not the goal of virtual arrays to create a symbolic representation of the program. That doesn't exist in awkward so far; it is what we've done over in https://github.com/dask-contrib/dask-awkward at the level of "highlevel Arrays". In principle it would be interesting to have this, but then it should likely be at the level of buffers and kernels that act on those.

@ikrommyd (Collaborator, Nov 27, 2025):
Looking at the CI errors for the virtual arrays, by the way, the tests seem to be hitting cupy/cupy#9089, which has been fixed but will only be released in CuPy 14. I see

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

I think if it weren't for this problem, they would probably pass too. So they can't be enabled until CuPy 14 is out at least.

def test_numpyarray_sort(numpyarray, virtual_numpyarray):
assert not virtual_numpyarray.is_any_materialized
assert ak.array_equal(
@@ -1224,7 +1223,6 @@ def test_listoffsetarray_nanargmax(numpy_like):
assert virtual_array.is_all_materialized


@pytest.mark.xfail(reason="awkward_sort is not implemented")
def test_listoffsetarray_sort(listoffsetarray, virtual_listoffsetarray):
assert not virtual_listoffsetarray.is_any_materialized
assert ak.array_equal(
@@ -2256,7 +2254,9 @@ def test_listarray_nanargmax(numpy_like):
assert virtual_array.is_all_materialized


@pytest.mark.xfail(reason="awkward_sort is not implemented")
@pytest.mark.xfail(
reason="ListArray.to_ListOffsetArray64 fails with virtual arrays on CUDA"
)
def test_listarray_sort(listarray, virtual_listarray):
assert not virtual_listarray.is_any_materialized
assert ak.array_equal(
@@ -3356,7 +3356,6 @@ def test_recordarray_argmax_y_field(recordarray, virtual_recordarray):
assert virtual_recordarray.is_any_materialized


@pytest.mark.xfail(reason="awkward_sort is not implemented")
def test_recordarray_sort_x_field(recordarray, virtual_recordarray):
# Test sort on the x field (ListOffsetArray)
assert not virtual_recordarray.is_any_materialized
@@ -3370,7 +3369,6 @@ def test_recordarray_sort_x_field(recordarray, virtual_recordarray):
assert virtual_recordarray.is_any_materialized


@pytest.mark.xfail(reason="awkward_sort is not implemented")
def test_recordarray_sort_y_field(recordarray, virtual_recordarray):
# Test sort on the y field (NumpyArray)
assert not virtual_recordarray.is_any_materialized