diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index d92809e885..1457ca9c13 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -210,6 +210,7 @@ jobs:
             -c rapidsai
             -c nvidia
             python=3.13
+            cccl-python
             cudf
             cupy
             cuda-version=${{ matrix.cuda-version }}
diff --git a/requirements-test-gpu.txt b/requirements-test-gpu.txt
index 5be787cd65..abece09a74 100644
--- a/requirements-test-gpu.txt
+++ b/requirements-test-gpu.txt
@@ -1,8 +1,9 @@
-# The CI installs cudf and cupy using conda
+# The CI installs cudf, cupy, and cccl-python (conda counterpart of cuda-cccl) using conda
 # If you are using this file manually uncomment the following lines and
 # set the cuda version matching your system.
 # cudf-cu12
 # cupy-cuda12x
+# cuda-cccl[cu12]
 fsspec>=2022.11.0
 numba>=0.60
 numba-cuda
diff --git a/src/awkward/_backends/cupy.py b/src/awkward/_backends/cupy.py
index 7c2c4d69e0..028a604b8a 100644
--- a/src/awkward/_backends/cupy.py
+++ b/src/awkward/_backends/cupy.py
@@ -4,7 +4,7 @@
 
 from awkward._backends.backend import Backend, KernelKeyType
 from awkward._backends.dispatch import register_backend
-from awkward._kernels import CupyKernel, NumpyKernel
+from awkward._kernels import CudaComputeKernel, CupyKernel, NumpyKernel
 from awkward._nplikes.cupy import Cupy
 from awkward._nplikes.numpy import Numpy
 from awkward._nplikes.numpy_like import NumpyMetadata
@@ -27,13 +27,74 @@ def nplike(self) -> Cupy:
     def __init__(self):
         self._cupy = Cupy.instance()
 
-    def __getitem__(self, index: KernelKeyType) -> CupyKernel | NumpyKernel:
+    def __getitem__(
+        self, index: KernelKeyType
+    ) -> CudaComputeKernel | CupyKernel | NumpyKernel:
+        """
+        Return the kernel registered for ``index`` on the CUDA backend.
+
+        CuPy kernels are the primary implementation. If none is registered for
+        ``index``, the cuda.compute implementation of the kernel named by
+        ``index[0]`` is used as a fallback, when there is one.
+
+        Raises:
+            NotImplementedError: The kernel is only implemented through
+                cuda.compute, and cuda.compute is not available.
+            AssertionError: The kernel has no implementation at all.
+        """
         from awkward._connect import cuda
 
         cupy = cuda.import_cupy("Awkward Arrays with CUDA")
         _cuda_kernels = cuda.initialize_cuda_kernels(cupy)
         func = _cuda_kernels[index]
         if func is not None:
             return CupyKernel(func, index)
-        else:
-            raise AssertionError(f"CuPyKernel not found: {index!r}")
+
+        # CuPy kernel not found, try cuda.compute as fallback
+        from awkward._connect.cuda import _compute as cuda_compute
+
+        kernel_name = index[0] if index else ""
+        compute_impl = self._get_cuda_compute_impl(kernel_name)
+        if compute_impl is None:
+            # Neither CuPy kernel nor cuda.compute implementation found
+            raise AssertionError(
+                f"Operation '{kernel_name}' is not supported on CUDA backend. "
+                f"CuPy kernel not found: {index!r}"
+            )
+
+        if not cuda_compute.is_available():
+            # cuda.compute is needed but not available
+            raise NotImplementedError(
+                f"Operation '{kernel_name}' on CUDA backend requires cuda.compute library "
+                "(no CuPy kernel available). "
+                "Please install cuda.compute (provided by the 'cuda-cccl' package) "
+                "or use the CPU backend: ak.to_backend(array, 'cpu')"
+            )
+
+        return CudaComputeKernel(compute_impl, index)
+
+    def _supports_cuda_compute(self, kernel_name: str) -> bool:
+        """
+        Check if the given kernel operation is supported by cuda.compute.
+
+        Delegates to ``_get_cuda_compute_impl`` so that the supported kernels
+        are listed in one place only (currently just ``awkward_sort``).
+        """
+        return self._get_cuda_compute_impl(kernel_name) is not None
+
+    def _get_cuda_compute_impl(self, kernel_name: str):
+        """
+        Get the cuda.compute implementation for a kernel operation.
+
+        Args:
+            kernel_name: Name of the kernel operation (e.g., "awkward_sort")
+
+        Returns:
+            Callable implementing the operation, or None if not supported
+        """
+        from awkward._connect.cuda import _compute as cuda_compute
+
+        if kernel_name == "awkward_sort":
+            return cuda_compute.segmented_sort
+
+        return None
diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py
new file mode 100644
index 0000000000..0c0d77368d
--- /dev/null
+++ b/src/awkward/_connect/cuda/_compute.py
@@ -0,0 +1,94 @@
+# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE
+
+from __future__ import annotations
+
+from awkward._nplikes.cupy import Cupy
+
+# Cache for cuda.compute availability
+_cuda_compute_available: bool | None = None
+
+
+def is_available() -> bool:
+    """
+    Return True if the ``cuda.compute`` module can be imported.
+
+    The import is attempted on the first call only; the outcome is cached in
+    the module-level ``_cuda_compute_available`` flag and reused afterwards.
+    """
+    global _cuda_compute_available
+
+    if _cuda_compute_available is None:
+        try:
+            import cuda.compute  # noqa: F401
+        except ImportError:
+            _cuda_compute_available = False
+        else:
+            _cuda_compute_available = True
+
+    return _cuda_compute_available
+
+
+def segmented_sort(
+    toptr,
+    fromptr,
+    length,
+    offsets,
+    offsetslength,
+    parentslength,
+    ascending,
+    stable,
+):
+    """
+    Sort every segment of ``fromptr`` independently, writing into ``toptr``.
+
+    Segment ``i`` spans ``offsets[i]:offsets[i + 1]``. The sort itself is
+    delegated to ``cuda.compute.segmented_sort`` (keys only, default stream).
+
+    Args:
+        toptr: Output buffer receiving the sorted keys.
+        fromptr: Input buffer holding the keys to sort.
+        length: Unused by this implementation.
+        offsets: Segment boundaries; converted to int64 if necessary.
+        offsetslength: Number of entries of ``offsets`` to use, i.e. the
+            number of segments plus one.
+        parentslength: Unused by this implementation.
+        ascending: Sort in ascending order if true, descending otherwise.
+        stable: Unused; it is not forwarded to ``cuda.compute``.
+    """
+    # Aliased so the library function does not shadow this wrapper's own name
+    from cuda.compute import SortOrder
+    from cuda.compute import segmented_sort as _cccl_segmented_sort
+
+    num_segments = offsetslength - 1
+    if num_segments <= 0:
+        # No segments (this also covers an empty offsets buffer, which would
+        # otherwise produce a negative segment count): nothing to sort
+        return
+
+    cp = Cupy.instance()._module
+
+    # Use exactly the entries announced by offsetslength
+    offsets = offsets[:offsetslength]
+    # Ensure offsets are int64 as expected by segmented_sort
+    if offsets.dtype != cp.int64:
+        offsets = offsets.astype(cp.int64)
+
+    num_items = int(offsets[-1])
+    if num_items == 0:
+        # Every segment is empty: nothing to sort
+        return
+
+    order = SortOrder.ASCENDING if ascending else SortOrder.DESCENDING
+
+    _cccl_segmented_sort(
+        fromptr,  # d_in_keys
+        toptr,  # d_out_keys
+        None,  # d_in_values (not sorting values, just keys)
+        None,  # d_out_values
+        num_items,  # num_items
+        num_segments,  # num_segments
+        offsets[:-1],  # start_offsets_in
+        offsets[1:],  # end_offsets_in
+        order,  # order (ASCENDING or DESCENDING)
+        None,  # stream (use default stream)
+    )
diff --git a/src/awkward/_kernels.py b/src/awkward/_kernels.py
index b578e2860b..03dbe9aed2 100644
--- a/src/awkward/_kernels.py
+++ b/src/awkward/_kernels.py
@@ -17,7 +17,8 @@
 from awkward._nplikes.typetracer import try_touch_data
 from awkward._typing import Protocol, TypeAlias
 
-KernelKeyType: TypeAlias = tuple  # Tuple[str, Unpack[Tuple[metadata.dtype, ...]]]
+# Tuple[str, Unpack[Tuple[metadata.dtype, ...]]]
+KernelKeyType: TypeAlias = tuple
 
 
 numpy = Numpy.instance()
@@ -215,6 +216,24 @@ def __call__(self, *args) -> None:
         self._impl(grid, blocks, args)
 
 
+class CudaComputeKernel(BaseKernel):
+    """
+    Kernel implementation using cuda.compute library.
+
+    The CUDA backend returns this kernel only for operations that have no
+    CuPy kernel but do have a ``cuda.compute`` implementation; operations
+    with a CuPy kernel are served by ``CupyKernel`` instead.
+    """
+
+    def __init__(self, impl: Callable[..., Any], key: KernelKeyType):
+        super().__init__(impl, key)
+        self._cupy = Cupy.instance()
+
+    def __call__(self, *args) -> None:
+        args = maybe_materialize(*args)
+        return self._impl(*args)
+
+
 class TypeTracerKernelError(KernelError):
     def __init__(self):
         self.str = None
diff --git a/studies/cccl/_segment_algorithms.py b/studies/cccl/_segment_algorithms.py
index fed911a791..39c8f0e088 100644
--- a/studies/cccl/_segment_algorithms.py
+++ b/studies/cccl/_segment_algorithms.py
@@ -237,7 +237,8 @@ def segmented_select(
     num_segments = len(d_in_segments) - 1
 
     cond = numba.cuda.jit(cond)
-    # Apply select to get the data and indices where condition is true
+
+    # Step 1: Apply select to get the data and indices where condition is true
 
     def select_predicate(pair):
         return cond(pair[0])
@@ -253,13 +254,13 @@ def select_predicate(pair):
     d_indices_out = d_indices_out[:total_selected]
     d_selected_indices = d_indices_out[:total_selected]
 
-    # Step 3: Use searchsorted to count selected items per segment
+    # Step 2: Use searchsorted to count selected items per segment
     # Use side='left' to count elements strictly less than each offset boundary
     positions = cp.searchsorted(
         d_selected_indices, d_in_segments, side='left')
     d_counts = (positions[1:] - positions[:-1]).astype(cp.uint64)
 
-    # Step 4: Use exclusive scan to compute output segment start offsets
+    # Step 3: Use exclusive scan to compute output segment start offsets
     exclusive_scan(
         d_counts,
         d_out_segments[:-1],
@@ -269,7 +270,7 @@ def select_predicate(pair):
         stream,
     )
 
-    # Step 5: Set the final offset to the total count
+    # Set the final offset to the total count
     d_out_segments[-1] = total_selected
     return total_selected
 
diff --git a/tests-cuda/test_3459_virtualarray_with_cuda.py b/tests-cuda/test_3459_virtualarray_with_cuda.py
index e2bcab1275..bb15a2a04b 100644
--- a/tests-cuda/test_3459_virtualarray_with_cuda.py
+++ b/tests-cuda/test_3459_virtualarray_with_cuda.py
@@ -494,7 +494,6 @@ def test_numpyarray_nanargmax(numpyarray, virtual_numpyarray):
     assert virtual_numpyarray.is_all_materialized
 
 
-@pytest.mark.xfail(reason="awkward_sort is not implemented")
 def test_numpyarray_sort(numpyarray, virtual_numpyarray):
     assert not virtual_numpyarray.is_any_materialized
     assert ak.array_equal(
@@ -1224,7 +1223,6 @@ def test_listoffsetarray_nanargmax(numpy_like):
     assert virtual_array.is_all_materialized
 
 
-@pytest.mark.xfail(reason="awkward_sort is not implemented")
 def test_listoffsetarray_sort(listoffsetarray, virtual_listoffsetarray):
     assert not virtual_listoffsetarray.is_any_materialized
     assert ak.array_equal(
@@ -2256,7 +2254,9 @@ def test_listarray_nanargmax(numpy_like):
     assert virtual_array.is_all_materialized
 
 
-@pytest.mark.xfail(reason="awkward_sort is not implemented")
+@pytest.mark.xfail(
+    reason="ListArray.to_ListOffsetArray64 fails with virtual arrays on CUDA"
+)
 def test_listarray_sort(listarray, virtual_listarray):
     assert not virtual_listarray.is_any_materialized
     assert ak.array_equal(
@@ -3356,7 +3356,6 @@ def test_recordarray_argmax_y_field(recordarray, virtual_recordarray):
     assert virtual_recordarray.is_any_materialized
 
 
-@pytest.mark.xfail(reason="awkward_sort is not implemented")
 def test_recordarray_sort_x_field(recordarray, virtual_recordarray):
     # Test sort on the x field (ListOffsetArray)
     assert not virtual_recordarray.is_any_materialized
@@ -3370,7 +3369,6 @@ def test_recordarray_sort_x_field(recordarray, virtual_recordarray):
     assert virtual_recordarray.is_any_materialized
 
 
-@pytest.mark.xfail(reason="awkward_sort is not implemented")
 def test_recordarray_sort_y_field(recordarray, virtual_recordarray):
     # Test sort on the y field (NumpyArray)
     assert not virtual_recordarray.is_any_materialized
diff --git a/tests-cuda/test_3749_cuda_backend_sort.py b/tests-cuda/test_3749_cuda_backend_sort.py
new file mode 100644
index 0000000000..44d3121510
--- /dev/null
+++ b/tests-cuda/test_3749_cuda_backend_sort.py
@@ -0,0 +1,115 @@
+# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+import awkward as ak
+
+to_list = ak.operations.to_list
+
+
+def test_sort_cuda_basic():
+    data = ak.Array([[7, 5, 7], [], [2], [8, 2]])
+    gpu_data = ak.to_backend(data, "cuda")
+    gpu_sorted = ak.sort(gpu_data)
+    result = ak.to_backend(gpu_sorted, "cpu")
+
+    assert to_list(result) == [[5, 7, 7], [], [2], [2, 8]]
+
+
+def test_sort_cuda_descending():
+    data = ak.Array([[3, 1, 2], [5, 4], [], [6]])
+    gpu_data = ak.to_backend(data, "cuda")
+    gpu_sorted = ak.sort(gpu_data, ascending=False)
+    result = ak.to_backend(gpu_sorted, "cpu")
+
+    assert to_list(result) == [[3, 2, 1], [5, 4], [], [6]]
+
+
+def test_sort_cuda_float():
+    data = ak.Array([[3.5, 1.2, 2.8], [5.1, 4.9], [], [6.0]])
+    gpu_data = ak.to_backend(data, "cuda")
+    gpu_sorted = ak.sort(gpu_data)
+    result = ak.to_backend(gpu_sorted, "cpu")
+
+    # Sorting only reorders values, so the floats must come back bit-for-bit
+    assert to_list(result) == [[1.2, 2.8, 3.5], [4.9, 5.1], [], [6.0]]
+
+
+def test_sort_cuda_large():
+    # Seeded local generator: reproducible without touching NumPy's global RNG
+    rng = np.random.default_rng(42)
+    # 100 lists of 0 to 49 items; a size of 0 yields an empty list
+    data_list = [
+        rng.integers(0, 100, size=rng.integers(0, 50)).tolist() for _ in range(100)
+    ]
+
+    data = ak.Array(data_list)
+    cpu_sorted = ak.sort(data)
+
+    gpu_data = ak.to_backend(data, "cuda")
+    gpu_sorted = ak.sort(gpu_data)
+    result = ak.to_backend(gpu_sorted, "cpu")
+
+    assert to_list(result) == to_list(cpu_sorted)
+
+
+def test_sort_cuda_nested():
+    data = ak.Array([[[3, 1, 2], [5, 4]], [[9, 7, 8]], [[6]]])
+    cpu_sorted = ak.sort(data, axis=-1)
+
+    gpu_data = ak.to_backend(data, "cuda")
+    gpu_sorted = ak.sort(gpu_data, axis=-1)
+    result = ak.to_backend(gpu_sorted, "cpu")
+
+    assert to_list(result) == to_list(cpu_sorted)
+    assert to_list(result) == [[[1, 2, 3], [4, 5]], [[7, 8, 9]], [[6]]]
+
+
+def test_sort_cuda_deeply_nested():
+    data = ak.Array([[[[5, 2, 8], [1, 3]], [[4, 6]]], [[[9, 7]]]])
+    cpu_sorted = ak.sort(data, axis=-1)
+
+    gpu_data = ak.to_backend(data, "cuda")
+    gpu_sorted = ak.sort(gpu_data, axis=-1)
+    result = ak.to_backend(gpu_sorted, "cpu")
+
+    assert to_list(result) == to_list(cpu_sorted)
+    assert to_list(result) == [[[[2, 5, 8], [1, 3]], [[4, 6]]], [[[7, 9]]]]
+
+
+def test_sort_cuda_unsupported_axis():
+    """Test that sorting at unsupported axes fails with clear error."""
+    # Sorting at axis=-2 requires CuPy kernels that don't exist
+    # This should fail with an AssertionError indicating missing kernels
+    data = ak.Array([[[7, 2, 3], [4, 5, 6]]])
+    gpu_data = ak.to_backend(data, "cuda")
+
+    # axis=-1 should work (our cuda.compute implementation)
+    sorted_axis_minus1 = ak.sort(gpu_data, axis=-1)
+    result = ak.to_backend(sorted_axis_minus1, "cpu")
+    assert to_list(result) == [[[2, 3, 7], [4, 5, 6]]]
+
+    # axis=-2 should fail (requires CuPy kernels not available)
+    with pytest.raises(
+        AssertionError,
+        match=r"(CuPyKernel not found|Operation .* is not supported)",
+    ):
+        ak.sort(gpu_data, axis=-2)
+
+
+def test_sort_cuda_no_compute(monkeypatch):
+    """Test that helpful error is raised when cuda.compute is not available."""
+    from awkward._connect.cuda import _compute as cuda_compute
+
+    # Pretend cuda.compute cannot be imported; monkeypatch restores the cached
+    # flag after the test, whether it passes or fails
+    monkeypatch.setattr(cuda_compute, "_cuda_compute_available", False)
+
+    data = ak.Array([[7, 5, 7], [], [2], [8, 2]])
+    gpu_data = ak.to_backend(data, "cuda")
+
+    with pytest.raises(NotImplementedError, match=r"cuda\.compute"):
+        ak.sort(gpu_data)