From 860868d536e5162670bf51950d6c9bde988ff70b Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Wed, 29 May 2024 11:28:40 +0200
Subject: [PATCH 01/33] feat: add tree reduction implementation of argmin and
 argmax

---
 dev/generate-kernel-signatures.py             |  2 +
 kernel-test-data.json                         |  4 +-
 .../cuda_kernels/awkward_reduce_argmax.cu     | 76 +++++++++++++++++--
 .../cuda_kernels/awkward_reduce_argmin.cu     | 76 +++++++++++++++++--
 4 files changed, 144 insertions(+), 14 deletions(-)
diff --git a/dev/generate-kernel-signatures.py b/dev/generate-kernel-signatures.py
index b64946626c..e020391e01 100644
--- a/dev/generate-kernel-signatures.py
+++ b/dev/generate-kernel-signatures.py
@@ -374,6 +374,8 @@ def kernel_signatures_cuda_py(specification):
 from awkward._connect.cuda import fetch_specialization
 from awkward._connect.cuda import import_cupy
 
+import math
+
 cupy = import_cupy("Awkward Arrays with CUDA")
 """
         )
diff --git a/kernel-test-data.json b/kernel-test-data.json
index fde02211fa..fc0fb07cd5 100644
--- a/kernel-test-data.json
+++ b/kernel-test-data.json
@@ -23348,7 +23348,7 @@
         },
         {
             "name": "awkward_reduce_argmax",
-            "status": false,
+            "status": true,
             "tests": [
                 {
                     "error": false,
@@ -25544,7 +25544,7 @@
         },
         {
             "name": "awkward_reduce_argmin",
-            "status": false,
+            "status": true,
             "tests": [
                 {
                     "error": false,
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu
index 555420c3f8..ee57d912fc 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu
@@ -3,10 +3,20 @@
 // BEGIN PYTHON
 // def f(grid, block, args):
 //     (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code))
+//     shared_mem_size = block[0] * toptr.dtype.itemsize
+//     if block[0] > 0:
+//         segment = math.floor((outlength + block[0] - 1) / block[0])
+//         partial_size = outlength * ((lenparents + block[0] - 1) / block[0])
+//     else:
+//         segment = 0
+//         partial_size = 0
+//     partial = cupy.full(math.floor(partial_size), -1, dtype=toptr.dtype)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, partial, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, partial, invocation_index, err_code), shared_mem=shared_mem_size)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, invocation_index, err_code))
 // out["awkward_reduce_argmax_a", {dtype_specializations}] = None
 // out["awkward_reduce_argmax_b", {dtype_specializations}] = None
+// out["awkward_reduce_argmax_c", {dtype_specializations}] = None
 // END PYTHON
 
 template <typename T, typename C, typename U>
@@ -17,10 +27,12 @@ awkward_reduce_argmax_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
+    T* partial,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
     if (thread_id < outlength) {
       toptr[thread_id] = -1;
     }
@@ -35,17 +47,69 @@ awkward_reduce_argmax_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
+    T* partial,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
+    T *shared_mem = reinterpret_cast<T *>(shared_memory);
+
+    int64_t idx = threadIdx.x;
+    int64_t thread_id = blockIdx.x * blockDim.x + idx;
+
+    if (thread_id < lenparents) {
+      shared_mem[idx] = thread_id;
+    } else {
+      shared_mem[idx] = -1;
+    }
+    __syncthreads();
+
+    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+      int64_t index = -1;
+      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+        index = shared_mem[idx - stride];
+      }
+      if (index != -1 && (shared_mem[idx] == -1 || fromptr[index] > fromptr[shared_mem[idx]] ||
+         (fromptr[index] == fromptr[shared_mem[idx]] && index < shared_mem[idx]))) {
+        shared_mem[idx] = index;
+      }
+      __syncthreads();
+    }
 
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
-      if (toptr[parent] == -1 ||
-          (fromptr[thread_id] > (fromptr[toptr[parent]]))) {
-        toptr[parent] = thread_id; // we need the last parent filled, thread random order problem, find max arg at that index
+      if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+        partial[blockIdx.x * outlength + parent] = shared_mem[idx];
+      }
+    }
+  }
+}
+
+template <typename T, typename C, typename U>
+__global__ void
+awkward_reduce_argmax_c(
+    T* toptr,
+    const C* fromptr,
+    const U* parents,
+    int64_t lenparents,
+    int64_t outlength,
+    T* partial,
+    uint64_t invocation_index,
+    uint64_t* err_code) {
+  if (err_code[0] == NO_ERROR) {
+    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    // Final pass: thread t folds the per-block candidates partial[i * outlength + t] into toptr[t].
+    if (thread_id < outlength) {
+      int64_t argmax = -1;  // -1 means "no candidate yet"; it stays -1 if every row holds -1
+      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;  // rows of partial to scan
+      for (int64_t i = 0; i < blocks; ++i) {
+        int64_t index = partial[i * outlength + thread_id];
+        if (index != -1 && (argmax == -1 || fromptr[index] > fromptr[argmax] ||  // the -1 checks guard every fromptr[] read
+           (fromptr[index] == fromptr[argmax] && index < argmax))) {  // on equal values the smaller index wins
+          argmax = index;
+        }
       }
+      toptr[thread_id] = argmax;
     }
   }
 }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu
index 282ebd11cc..262a51b57a 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu
@@ -3,10 +3,20 @@
 // BEGIN PYTHON
 // def f(grid, block, args):
 //     (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code))
+//     shared_mem_size = block[0] * toptr.dtype.itemsize
+//     if block[0] > 0:
+//         segment = math.floor((outlength + block[0] - 1) / block[0])
+//         partial_size = outlength * ((lenparents + block[0] - 1) / block[0])
+//     else:
+//         segment = 0
+//         partial_size = 0
+//     partial = cupy.full(math.floor(partial_size), -1, dtype=toptr.dtype)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, partial, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, partial, invocation_index, err_code), shared_mem=shared_mem_size)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, invocation_index, err_code))
 // out["awkward_reduce_argmin_a", {dtype_specializations}] = None
 // out["awkward_reduce_argmin_b", {dtype_specializations}] = None
+// out["awkward_reduce_argmin_c", {dtype_specializations}] = None
 // END PYTHON
 
 template <typename T, typename C, typename U>
@@ -17,10 +27,12 @@ awkward_reduce_argmin_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
+    T* partial,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
     if (thread_id < outlength) {
       toptr[thread_id] = -1;
     }
@@ -35,17 +47,69 @@ awkward_reduce_argmin_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
+    T* partial,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
+    T *shared_mem = reinterpret_cast<T *>(shared_memory);
+
+    int64_t idx = threadIdx.x;
+    int64_t thread_id = blockIdx.x * blockDim.x + idx;
+
+    if (thread_id < lenparents) {
+      shared_mem[idx] = thread_id;
+    } else {
+      shared_mem[idx] = -1;
+    }
+    __syncthreads();
+
+    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+      int64_t index = -1;
+      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+        index = shared_mem[idx - stride];
+      }
+      if (index != -1 && (shared_mem[idx] == -1 || fromptr[index] < fromptr[shared_mem[idx]] ||
+         (fromptr[index] == fromptr[shared_mem[idx]] && index < shared_mem[idx]))) {
+        shared_mem[idx] = index;
+      }
+      __syncthreads();
+    }
 
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
-      if (toptr[parent] == -1 ||
-          (fromptr[thread_id] < (fromptr[toptr[parent]]))) {
-        toptr[parent] = thread_id;
+      if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+        partial[blockIdx.x * outlength + parent] = shared_mem[idx];
+      }
+    }
+  }
+}
+
+template <typename T, typename C, typename U>
+__global__ void
+awkward_reduce_argmin_c(
+    T* toptr,
+    const C* fromptr,
+    const U* parents,
+    int64_t lenparents,
+    int64_t outlength,
+    T* partial,
+    uint64_t invocation_index,
+    uint64_t* err_code) {
+  if (err_code[0] == NO_ERROR) {
+    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    // Final pass: thread t folds the per-block candidates partial[i * outlength + t] into toptr[t].
+    if (thread_id < outlength) {
+      int64_t argmin = -1;  // -1 means "no candidate yet"; it stays -1 if every row holds -1
+      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;  // rows of partial to scan
+      for (int64_t i = 0; i < blocks; ++i) {
+        int64_t index = partial[i * outlength + thread_id];
+        if (index != -1 && (argmin == -1 || fromptr[index] < fromptr[argmin] ||  // the -1 checks guard every fromptr[] read
+           (fromptr[index] == fromptr[argmin] && index < argmin))) {  // on equal values the smaller index wins
+          argmin = index;
+        }
       }
+      toptr[thread_id] = argmin;
     }
   }
 }

From 3cdbd7e1aac65149b44a8a0b5b97bccd4c157357 Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Thu, 30 May 2024 15:37:02 +0200
Subject: [PATCH 02/33] feat: add
 awkward_ListOffsetArray_reduce_local_outoffsets_64 kernel

---
 dev/generate-kernel-signatures.py             |   1 +
 dev/generate-tests.py                         |   1 +
 kernel-test-data.json                         |   2 +-
 src/awkward/_connect/cuda/__init__.py         |   1 +
 ...tOffsetArray_reduce_local_outoffsets_64.cu | 134 ++++++++++++++++++
 5 files changed, 138 insertions(+), 1 deletion(-)
 create mode 100644 src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu

diff --git a/dev/generate-kernel-signatures.py b/dev/generate-kernel-signatures.py
index e020391e01..c70f787c25 100644
--- a/dev/generate-kernel-signatures.py
+++ b/dev/generate-kernel-signatures.py
@@ -101,6 +101,7 @@
     "awkward_ListOffsetArray_drop_none_indexes",
     "awkward_ListOffsetArray_reduce_local_nextparents_64",
     "awkward_ListOffsetArray_reduce_nonlocal_maxcount_offsetscopy_64",
+    "awkward_ListOffsetArray_reduce_local_outoffsets_64",
     "awkward_UnionArray_flatten_length",
     "awkward_UnionArray_flatten_combine",
     "awkward_UnionArray_nestedfill_tags_index",
diff --git a/dev/generate-tests.py b/dev/generate-tests.py
index 37dc859b9a..db89f2655e 100644
--- a/dev/generate-tests.py
+++ b/dev/generate-tests.py
@@ -886,6 +886,7 @@ def gencpuunittests(specdict):
     "awkward_ListOffsetArray_drop_none_indexes",
     "awkward_ListOffsetArray_reduce_local_nextparents_64",
     "awkward_ListOffsetArray_reduce_nonlocal_maxcount_offsetscopy_64",
+    "awkward_ListOffsetArray_reduce_local_outoffsets_64",
     "awkward_UnionArray_flatten_length",
     "awkward_UnionArray_flatten_combine",
     "awkward_UnionArray_nestedfill_tags_index",
diff --git a/kernel-test-data.json b/kernel-test-data.json
index fc0fb07cd5..db4bb86c93 100644
--- a/kernel-test-data.json
+++ b/kernel-test-data.json
@@ -18851,7 +18851,7 @@
         },
         {
             "name": "awkward_ListOffsetArray_reduce_local_outoffsets_64",
-            "status": false,
+            "status": true,
             "tests": [
                 {
                     "error": false,
diff --git a/src/awkward/_connect/cuda/__init__.py b/src/awkward/_connect/cuda/__init__.py
index 354fdcd217..5d7a77b8c5 100644
--- a/src/awkward/_connect/cuda/__init__.py
+++ b/src/awkward/_connect/cuda/__init__.py
@@ -105,6 +105,7 @@ def fetch_template_specializations(kernel_dict):
         "awkward_ListOffsetArray_drop_none_indexes",
         "awkward_ListOffsetArray_reduce_nonlocal_nextstarts_64",
         "awkward_ListOffsetArray_reduce_nonlocal_maxcount_offsetscopy_64",
+        "awkward_ListOffsetArray_reduce_local_outoffsets_64",
         "awkward_ListOffsetArray_rpad_length_axis1",
         "awkward_MaskedArray_getitem_next_jagged_project",
         "awkward_UnionArray_nestedfill_tags_index",
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu
new file mode 100644
index 0000000000..16dec17447
--- /dev/null
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu
@@ -0,0 +1,134 @@
+// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE
+
+// BEGIN PYTHON
+// def f(grid, block, args):
+//     (outoffsets, parents, lenparents, outlength, invocation_index, err_code) = args
+//     shared_mem_size = block[0] * outoffsets.dtype.itemsize
+//     if block[0] > 0:
+//         segment = math.floor((outlength + block[0] - 1) / block[0])
+//         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
+//     else:
+//         segment = 0
+//         grid_size = 1
+//     # _a and _d index outoffsets by outlength, so their grid is sized from outlength (at least one block so outoffsets[0] is written).
+//     parents = cupy.sort(parents)
+//     partial = cupy.zeros(outlength * grid_size, dtype=outoffsets.dtype)
+//     temp = cupy.zeros(lenparents, dtype=cupy.int64)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((max(segment, 1),), block, (outoffsets, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, partial, temp, invocation_index, err_code), shared_mem=shared_mem_size)
+//     scan_in_array = cupy.zeros(outlength, dtype=cupy.int64)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_c", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((segment,), block, (outoffsets, parents, lenparents, outlength, partial, scan_in_array, invocation_index, err_code))
+//     scan_in_array = cupy.cumsum(scan_in_array)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_d", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((max(segment, 1),), block, (outoffsets, parents, lenparents, outlength, partial, scan_in_array, invocation_index, err_code))
+// out["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", {dtype_specializations}] = None
+// out["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", {dtype_specializations}] = None
+// out["awkward_ListOffsetArray_reduce_local_outoffsets_64_c", {dtype_specializations}] = None
+// out["awkward_ListOffsetArray_reduce_local_outoffsets_64_d", {dtype_specializations}] = None
+// END PYTHON
+
+template <typename T, typename C>
+__global__ void
+awkward_ListOffsetArray_reduce_local_outoffsets_64_a(
+    T* outoffsets,
+    const C* parents,
+    int64_t lenparents,
+    int64_t outlength,
+    T* partial,
+    int64_t* temp,
+    uint64_t invocation_index,
+    uint64_t* err_code) {
+  if (err_code[0] == NO_ERROR) {
+    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    // Initialization pass: clears outoffsets[0 .. outlength - 1]; the other arguments are not read here.
+    if (thread_id < outlength) {
+      outoffsets[thread_id] = 0;
+    }
+  }
+}
+
+template <typename T, typename C>
+__global__ void
+awkward_ListOffsetArray_reduce_local_outoffsets_64_b(
+    T* outoffsets,
+    const C* parents,
+    int64_t lenparents,
+    int64_t outlength,
+    T* partial,
+    int64_t* temp,
+    uint64_t invocation_index,
+    uint64_t* err_code) {
+  if (err_code[0] == NO_ERROR) {
+    int64_t idx = threadIdx.x;
+    int64_t thread_id = blockIdx.x * blockDim.x + idx;
+    // Counting pass: every in-range element starts with a count of 1 in temp.
+    if (thread_id < lenparents) {
+        temp[thread_id] = 1;
+    }
+    __syncthreads();
+
+    // Doubling strides: add the count found `stride` slots to the left when it is in this block and has the same parent.
+    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+        int64_t val = 0;
+        if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+            val = temp[thread_id - stride];
+        }
+        __syncthreads();  // all reads of this round finish before any write
+        if (thread_id < lenparents) { temp[thread_id] += val; }  // temp is indexed only below lenparents; tail threads must not touch it
+        __syncthreads();
+    }
+
+    // The last element of each run of equal parents in this block publishes the run's count to partial[block][parent].
+    if (thread_id < lenparents) {
+        int64_t parent = parents[thread_id];
+        if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+            partial[blockIdx.x * outlength + parent] = temp[thread_id];
+        }
+    }
+  }
+}
+
+template <typename T, typename C>
+__global__ void
+awkward_ListOffsetArray_reduce_local_outoffsets_64_c(
+    T* outoffsets,
+    const C* parents,
+    int64_t lenparents,
+    int64_t outlength,
+    T* partial,
+    int64_t* scan_in_array,
+    uint64_t invocation_index,
+    uint64_t* err_code) {
+  if (err_code[0] == NO_ERROR) {
+    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    // Thread t adds up partial[i * outlength + t] over all rows i and stores the total in scan_in_array[t].
+    if (thread_id < outlength) {
+      int64_t count = 0;
+      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;  // number of rows of partial that are summed
+      for (int64_t i = 0; i < blocks; ++i) {
+        count += partial[i * outlength + thread_id];
+      }
+      scan_in_array[thread_id] = count;
+    }
+  }
+}
+
+template <typename T, typename C>
+__global__ void
+awkward_ListOffsetArray_reduce_local_outoffsets_64_d(
+    T* outoffsets,
+    const C* parents,
+    int64_t lenparents,
+    int64_t outlength,
+    T* partial,
+    int64_t* scan_in_array,
+    uint64_t invocation_index,
+    uint64_t* err_code) {
+  if (err_code[0] == NO_ERROR) {
+    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    outoffsets[0] = 0;  // every launched thread stores the same 0 here
+    // Copies scan_in_array[t] to outoffsets[t + 1] for t < outlength, so outoffsets is written up to index outlength.
+    if (thread_id < outlength) {
+      outoffsets[thread_id + 1] = scan_in_array[thread_id];
+    }
+  }
+}

From c1a846b15f832d0b506aea81417c3869a66e1d02 Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Thu, 30 May 2024 15:39:54 +0200
Subject: [PATCH 03/33] test: integration tests for cuda

---
 .../test_3136_cuda_argmin_and_argmax.py       | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 tests-cuda/test_3136_cuda_argmin_and_argmax.py

diff --git a/tests-cuda/test_3136_cuda_argmin_and_argmax.py b/tests-cuda/test_3136_cuda_argmin_and_argmax.py
new file mode 100644
index 0000000000..8f1f6613bc
--- /dev/null
+++ b/tests-cuda/test_3136_cuda_argmin_and_argmax.py
@@ -0,0 +1,32 @@
+from __future__ import annotations
+
+import awkward as ak
+
+to_list = ak.operations.to_list
+
+
+def test_argmin_argmax_axis_None():
+    array = ak.highlevel.Array(
+        [
+            [
+                [2022, 2023, 2025],
+                [],
+                [2027, 2011],
+                [2013],
+            ],
+            [],
+            [[2017, 2019], [2023]],
+        ],
+    )
+    cuda_array = ak.to_backend(array, "cuda")
+
+    assert ak.operations.argmin(cuda_array) == 4
+    assert ak.operations.argmax(cuda_array) == 3
+
+
+def test():
+    array = ak.highlevel.Array([1, 2, 3, None, 4])
+
+    cuda_array = ak.to_backend(array, "cuda")
+
+    assert ak.operations.argmax(cuda_array) == 4

From 7be3f982616280de58289997326b2b47907b7d70 Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Thu, 30 May 2024 16:22:52 +0200
Subject: [PATCH 04/33] test: some more integration tests for cuda

---
 .../test_3136_cuda_argmin_and_argmax.py       | 160 +++++++++++++++++-
 1 file changed, 158 insertions(+), 2 deletions(-)

diff --git a/tests-cuda/test_3136_cuda_argmin_and_argmax.py b/tests-cuda/test_3136_cuda_argmin_and_argmax.py
index 8f1f6613bc..cc60ecfd51 100644
--- a/tests-cuda/test_3136_cuda_argmin_and_argmax.py
+++ b/tests-cuda/test_3136_cuda_argmin_and_argmax.py
@@ -1,11 +1,14 @@
 from __future__ import annotations
 
+import cupy as cp
+import pytest
+
 import awkward as ak
 
 to_list = ak.operations.to_list
 
 
-def test_argmin_argmax_axis_None():
+def test_0835_argmin_argmax_axis_None():
     array = ak.highlevel.Array(
         [
             [
@@ -24,9 +27,162 @@ def test_argmin_argmax_axis_None():
     assert ak.operations.argmax(cuda_array) == 3
 
 
-def test():
+def test_1106_argminmax_axis_None_missing_values():
     array = ak.highlevel.Array([1, 2, 3, None, 4])
 
     cuda_array = ak.to_backend(array, "cuda")
 
     assert ak.operations.argmax(cuda_array) == 4
+
+
+def test_0070_argmin_and_argmax_jagged():
+    v2_array = ak.operations.from_iter(
+        [[2.2, 1.1, 3.3], [], [4.4, 5.5], [5.5], [-4.4, -5.5, -6.6]], highlevel=False
+    )
+
+    cuda_v2_array = ak.to_backend(v2_array, "cuda", highlevel=False)
+
+    assert to_list(ak.argmin(cuda_v2_array, axis=1, highlevel=False)) == [
+        1,
+        None,
+        0,
+        0,
+        2,
+    ]
+    assert (
+        ak.argmin(cuda_v2_array.to_typetracer(), axis=1, highlevel=False).form
+        == ak.argmin(cuda_v2_array, axis=1, highlevel=False).form
+    )
+
+    index2 = ak.index.Index64(cp.array([4, 3, 2, 1, 0], dtype=cp.int64))
+    cuda_v2_array2 = ak.contents.IndexedArray(index2, cuda_v2_array)
+
+    assert to_list(ak.argmin(cuda_v2_array2, axis=1, highlevel=False)) == [
+        2,
+        0,
+        0,
+        None,
+        1,
+    ]
+    assert (
+        ak.argmin(cuda_v2_array2.to_typetracer(), axis=1, highlevel=False).form
+        == ak.argmin(cuda_v2_array2, axis=1, highlevel=False).form
+    )
+
+    index3 = ak.index.Index64(cp.array([4, 3, -1, 4, 0], dtype=cp.int64))
+    cuda_v2_array2 = ak.contents.IndexedOptionArray(index3, cuda_v2_array)
+
+    assert to_list(ak.argmin(cuda_v2_array2, axis=1, highlevel=False)) == [
+        2,
+        0,
+        None,
+        2,
+        1,
+    ]
+    assert (
+        ak.argmin(cuda_v2_array2.to_typetracer(), axis=1, highlevel=False).form
+        == ak.argmin(cuda_v2_array2, axis=1, highlevel=False).form
+    )
+    assert to_list(ak.argmin(cuda_v2_array2, axis=-1, highlevel=False)) == [
+        2,
+        0,
+        None,
+        2,
+        1,
+    ]
+    assert (
+        ak.argmin(cuda_v2_array2.to_typetracer(), axis=-1, highlevel=False).form
+        == ak.argmin(cuda_v2_array2, axis=-1, highlevel=False).form
+    )
+
+
+def test_0070_argmin_and_argmax_missing():
+    array = ak.operations.from_iter(
+        [[[2.2, 1.1, 3.3]], [[]], [None, None, None], [[-4.4, -5.5, -6.6]]],
+        highlevel=False,
+    )
+
+    cuda_array = ak.to_backend(array, "cuda", highlevel=False)
+
+    assert to_list(ak.argmin(cuda_array, axis=2, highlevel=False)) == [
+        [1],
+        [None],
+        [None, None, None],
+        [2],
+    ]
+    assert (
+        ak.argmin(cuda_array.to_typetracer(), axis=2, highlevel=False).form
+        == ak.argmin(cuda_array, axis=2, highlevel=False).form
+    )
+
+
+def test_0115_generic_reducer_operation_ByteMaskedArray():
+    content = ak.operations.from_iter(
+        [
+            [[1.1, 0.0, 2.2], [], [3.3, 4.4]],
+            [],
+            [[5.5]],
+            [[6.6, 9.9, 8.8, 7.7]],
+            [[], [12.2, 11.1, 10.0]],
+        ],
+        highlevel=False,
+    )
+    mask = ak.index.Index8(cp.array([0, 0, 1, 1, 0], dtype=cp.int8))
+    content = ak.to_backend(content, "cuda", highlevel=False)
+
+    cuda_v2_array = ak.contents.ByteMaskedArray(mask, content, valid_when=False)
+
+    assert to_list(cuda_v2_array) == [
+        [[1.1, 0.0, 2.2], [], [3.3, 4.4]],
+        [],
+        None,
+        None,
+        [[], [12.2, 11.1, 10.0]],
+    ]
+    assert to_list(ak.argmin(cuda_v2_array, axis=-1, highlevel=False)) == [
+        [1, None, 0],
+        [],
+        None,
+        None,
+        [None, 2],
+    ]
+    assert (
+        ak.argmin(cuda_v2_array.to_typetracer(), axis=-1, highlevel=False).form
+        == ak.argmin(cuda_v2_array, axis=-1, highlevel=False).form
+    )
+
+
+@pytest.mark.parametrize(
+    "func",
+    [
+        ak.argmin,
+        ak.argmax,
+    ],
+)
+def test_2754_highlevel_behavior_missing_reducers(func):
+    behavior_1 = {"foo": "bar"}
+    behavior_2 = {"baz": "bargh!"}
+
+    array = ak.Array([[1, 2, 3, 4], [5], [10]])
+
+    cuda_array = ak.to_backend(array, "cuda")
+
+    assert isinstance(func(cuda_array, axis=1, highlevel=True), ak.Array)
+    assert isinstance(func(cuda_array, axis=1, highlevel=False), ak.contents.Content)
+    assert (
+        func(
+            ak.Array(cuda_array, behavior=behavior_1),
+            axis=1,
+            highlevel=True,
+            behavior=behavior_2,
+        ).behavior
+        == behavior_2
+    )
+    assert (
+        func(
+            ak.Array(cuda_array, behavior=behavior_1),
+            axis=1,
+            highlevel=True,
+        ).behavior
+        == behavior_1
+    )

From 98fb7ed9cf8db45063cecb658a3287859caf2b6d Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Thu, 6 Jun 2024 09:24:43 +0200
Subject: [PATCH 05/33] feat: add awkward_reduce_count_64 kernel

---
 kernel-test-data.json                         |  4 +-
 ...tOffsetArray_reduce_local_outoffsets_64.cu |  4 -
 .../cuda_kernels/awkward_reduce_count_64.cu   | 89 +++++++++++++------
 3 files changed, 65 insertions(+), 32 deletions(-)

diff --git a/kernel-test-data.json b/kernel-test-data.json
index db4bb86c93..900f81c525 100644
--- a/kernel-test-data.json
+++ b/kernel-test-data.json
@@ -23795,7 +23795,7 @@
         },
         {
             "name": "awkward_reduce_count_64",
-            "status": false,
+            "status": true,
             "tests": [
                 {
                     "error": false,
@@ -23839,7 +23839,7 @@
                     "inputs": {
                         "lenparents": 1696,
                         "outlength": 331,
-                        "parents": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 137, 137, 137, 137, 137, 137, 137, 137, 137, 137, 154, 154, 154, 154, 154, 154, 154, 154, 154, 154, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 138, 138, 138, 138, 138, 138, 138, 138, 138, 138, 155, 155, 155, 155, 155, 155, 155, 155, 155, 155, 172, 172, 
172, 172, 172, 172, 172, 172, 172, 172, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 173, 173, 173, 173, 173, 173, 173, 173, 173, 173, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, 106, 106, 106, 106, 106, 106, 106, 106, 106, 106, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 157, 157, 157, 157, 157, 157, 157, 157, 157, 157, 174, 174, 174, 174, 174, 174, 174, 174, 174, 174, 191, 191, 191, 191, 191, 191, 191, 191, 191, 191, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 73, 
73, 73, 73, 73, 73, 73, 73, 73, 73, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 141, 141, 141, 141, 141, 141, 141, 141, 141, 141, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 159, 159, 159, 159, 159, 159, 159, 159, 159, 159, 176, 176, 176, 176, 176, 176, 176, 176, 176, 176, 193, 193, 193, 193, 193, 193, 193, 193, 193, 193, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 143, 143, 143, 143, 143, 143, 143, 143, 143, 143, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 177, 177, 177, 177, 177, 177, 177, 177, 177, 177, 194, 194, 194, 194, 194, 194, 194, 194, 194, 194, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]
+                        "parents": [194, 194, 194, 194, 194, 194, 194, 194, 194, 194, 193, 193, 193, 193, 193, 193, 193, 193, 193, 193, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 191, 191, 191, 191, 191, 191, 191, 191, 191, 191, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 177, 177, 177, 177, 177, 177, 177, 177, 177, 177, 176, 176, 176, 176, 176, 176, 176, 176, 176, 176, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 174, 174, 174, 174, 174, 174, 174, 174, 174, 174, 173, 173, 173, 173, 173, 173, 173, 173, 173, 173, 172, 172, 172, 172, 172, 172, 172, 172, 172, 172, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 159, 159, 159, 159, 159, 159, 159, 159, 159, 159, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 157, 157, 157, 157, 157, 157, 157, 157, 157, 157, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 155, 155, 155, 155, 155, 155, 155, 155, 155, 155, 154, 154, 154, 154, 154, 154, 154, 154, 154, 154, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 143, 143, 143, 143, 143, 143, 143, 143, 143, 143, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 141, 141, 141, 141, 141, 141, 141, 141, 141, 141, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 138, 138, 138, 138, 138, 138, 138, 138, 138, 138, 137, 137, 137, 137, 137, 137, 137, 137, 137, 137, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 119, 119, 
119, 119, 119, 119, 119, 119, 119, 119, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 106, 106, 106, 106, 106, 106, 106, 106, 106, 106, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
                     },
                     "outputs": {
                         "toptr": [626, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu
index 16dec17447..e94183229b 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu
@@ -10,8 +10,6 @@
 //     else:
 //         segment = 0
 //         grid_size = 1
-//     print(block, grid_size)
-//     parents = cupy.sort(parents)
 //     partial = cupy.zeros(outlength * grid_size, dtype=outoffsets.dtype)
 //     temp = cupy.zeros(lenparents, dtype=cupy.int64)
 //     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
@@ -66,7 +64,6 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_b(
     }
     __syncthreads();
 
-
     for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
         int64_t val = 0;
         if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
@@ -77,7 +74,6 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_b(
         __syncthreads();
     }
 
-
     if (thread_id < lenparents) {
         int64_t parent = parents[thread_id];
         if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu
index 311f04012b..d880c36f0e 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu
@@ -2,68 +2,105 @@
 
 // BEGIN PYTHON
 // def f(grid, block, args):
-//     (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args
-//     atomicAdd_toptr = cupy.array(toptr, dtype=cupy.uint64)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_a", toptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_b", toptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_c", toptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code))
+//     (toptr, parents, lenparents, outlength, invocation_index, err_code) = args
+//     shared_mem_size = block[0] * toptr.dtype.itemsize
+//     if block[0] > 0:
+//         segment = math.floor((outlength + block[0] - 1) / block[0])
+//         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
+//     else:
+//         segment = 0
+//         grid_size = 1
+//     partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype)
+//     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_a", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_b", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code), shared_mem=shared_mem_size)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_c", cupy.dtype(toptr.dtype).type, parents.dtype]))((segment,), block, (toptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
 // out["awkward_reduce_count_64_a", {dtype_specializations}] = None
 // out["awkward_reduce_count_64_b", {dtype_specializations}] = None
 // out["awkward_reduce_count_64_c", {dtype_specializations}] = None
 // END PYTHON
 
-template <typename T, typename U>
+template <typename T, typename C>
 __global__ void
 awkward_reduce_count_64_a(
     T* toptr,
-    const bool* fromptr,
-    const U* parents,
+    const C* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomicAdd_toptr,
+    T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
     if (thread_id < outlength) {
-      atomicAdd_toptr[thread_id] = 0;
+      toptr[thread_id] = 0;
     }
   }
 }
 
-template <typename T, typename U>
+template <typename T, typename C>
 __global__ void
 awkward_reduce_count_64_b(
     T* toptr,
-    const bool* fromptr,
-    const U* parents,
+    const C* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomicAdd_toptr,
+    T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t idx = threadIdx.x;
+    int64_t thread_id = blockIdx.x * blockDim.x + idx;
+
     if (thread_id < lenparents) {
-      atomicAdd(atomicAdd_toptr + parents[thread_id], (uint64_t)1);
+        temp[thread_id] = 1;
+    }
+    __syncthreads();
+
+
+    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+        int64_t val = 0;
+        if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+            val = temp[thread_id - stride];
+        }
+        __syncthreads();
+        temp[thread_id] += val;
+        __syncthreads();
+    }
+
+    if (thread_id < lenparents) {
+        int64_t parent = parents[thread_id];
+        if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+            partial[blockIdx.x * outlength + parent] = temp[thread_id];
+        }
     }
   }
 }
 
-template <typename T, typename U>
+template <typename T, typename C>
 __global__ void
-awkward_reduce_count_64_c(T* toptr,
-                          const bool* fromptr,
-                          const U* parents,
-                          int64_t lenparents,
-                          int64_t outlength,
-                          uint64_t* atomicAdd_toptr,
-                          uint64_t invocation_index,
-                          uint64_t* err_code) {
+awkward_reduce_count_64_c(
+    T* toptr,
+    const C* parents,
+    int64_t lenparents,
+    int64_t outlength,
+    T* partial,
+    T* temp,
+    uint64_t invocation_index,
+    uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
     if (thread_id < outlength) {
-      toptr[thread_id] = (T)atomicAdd_toptr[thread_id];
+      int64_t count = 0;
+      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;
+      for (int64_t i = 0; i < blocks; ++i) {
+        count += partial[i * outlength + thread_id];
+      }
+      toptr[thread_id] = count;
     }
   }
 }

From 0ed94efb9ee6b549c9544decf3bcce77635d5c72 Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Thu, 6 Jun 2024 09:52:13 +0200
Subject: [PATCH 06/33] fix: indexing and indentation

---
 ...tOffsetArray_reduce_local_outoffsets_64.cu | 27 +++++++--------
 .../cuda_kernels/awkward_reduce_argmax.cu     | 34 +++++++++----------
 .../cuda_kernels/awkward_reduce_argmin.cu     | 34 +++++++++----------
 .../cuda_kernels/awkward_reduce_count_64.cu   | 28 +++++++--------
 4 files changed, 58 insertions(+), 65 deletions(-)

diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu
index e94183229b..5ae3d2eb56 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu
@@ -3,7 +3,6 @@
 // BEGIN PYTHON
 // def f(grid, block, args):
 //     (outoffsets, parents, lenparents, outlength, invocation_index, err_code) = args
-//     shared_mem_size = block[0] * outoffsets.dtype.itemsize
 //     if block[0] > 0:
 //         segment = math.floor((outlength + block[0] - 1) / block[0])
 //         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
@@ -13,7 +12,7 @@
 //     partial = cupy.zeros(outlength * grid_size, dtype=outoffsets.dtype)
 //     temp = cupy.zeros(lenparents, dtype=cupy.int64)
 //     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, partial, temp, invocation_index, err_code), shared_mem=shared_mem_size)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
 //     scan_in_array = cupy.zeros(outlength, dtype=cupy.int64)
 //     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_c", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((segment,), block, (outoffsets, parents, lenparents, outlength, partial, scan_in_array, invocation_index, err_code))
 //     scan_in_array = cupy.cumsum(scan_in_array)
@@ -60,25 +59,25 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_b(
     int64_t thread_id = blockIdx.x * blockDim.x + idx;
 
     if (thread_id < lenparents) {
-        temp[thread_id] = 1;
+      temp[idx] = 1;
     }
     __syncthreads();
 
     for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
-        int64_t val = 0;
-        if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-            val = temp[thread_id - stride];
-        }
-        __syncthreads();
-        temp[thread_id] += val;
-        __syncthreads();
+      int64_t val = 0;
+      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+        val = temp[idx - stride];
+      }
+      __syncthreads();
+      temp[idx] += val;
+      __syncthreads();
     }
 
     if (thread_id < lenparents) {
-        int64_t parent = parents[thread_id];
-        if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-            partial[blockIdx.x * outlength + parent] = temp[thread_id];
-        }
+      int64_t parent = parents[thread_id];
+      if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+        partial[blockIdx.x * outlength + parent] = temp[idx];
+      }
     }
   }
 }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu
index ee57d912fc..122894795e 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu
@@ -3,17 +3,17 @@
 // BEGIN PYTHON
 // def f(grid, block, args):
 //     (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args
-//     shared_mem_size = block[0] * toptr.dtype.itemsize
 //     if block[0] > 0:
 //         segment = math.floor((outlength + block[0] - 1) / block[0])
-//         partial_size = outlength * ((lenparents + block[0] - 1) / block[0])
+//         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
 //     else:
 //         segment = 0
-//         partial_size = 0
-//     partial = cupy.full(math.floor(partial_size), -1, dtype=toptr.dtype)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, partial, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, partial, invocation_index, err_code), shared_mem=shared_mem_size)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, invocation_index, err_code))
+//         grid_size = 1
+//     partial = cupy.full(outlength * grid_size, -1, dtype=toptr.dtype)
+//     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
 // out["awkward_reduce_argmax_a", {dtype_specializations}] = None
 // out["awkward_reduce_argmax_b", {dtype_specializations}] = None
 // out["awkward_reduce_argmax_c", {dtype_specializations}] = None
@@ -28,6 +28,7 @@ awkward_reduce_argmax_a(
     int64_t lenparents,
     int64_t outlength,
     T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
@@ -48,30 +49,26 @@ awkward_reduce_argmax_b(
     int64_t lenparents,
     int64_t outlength,
     T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
-    extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
-    T *shared_mem = reinterpret_cast<T *>(shared_memory);
-
     int64_t idx = threadIdx.x;
     int64_t thread_id = blockIdx.x * blockDim.x + idx;
 
     if (thread_id < lenparents) {
-      shared_mem[idx] = thread_id;
-    } else {
-      shared_mem[idx] = -1;
+      temp[idx] = thread_id;
     }
     __syncthreads();
 
     for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
       int64_t index = -1;
       if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        index = shared_mem[idx - stride];
+        index = temp[idx - stride];
       }
-      if (index != -1 && (shared_mem[idx] == -1 || fromptr[index] > fromptr[shared_mem[idx]] ||
-         (fromptr[index] == fromptr[shared_mem[idx]] && index < shared_mem[idx]))) {
-        shared_mem[idx] = index;
+      if (index != -1 && (temp[idx] == -1 || fromptr[index] > fromptr[temp[idx]] ||
+         (fromptr[index] == fromptr[temp[idx]] && index < temp[idx]))) {
+        temp[idx] = index;
       }
       __syncthreads();
     }
@@ -79,7 +76,7 @@ awkward_reduce_argmax_b(
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        partial[blockIdx.x * outlength + parent] = shared_mem[idx];
+        partial[blockIdx.x * outlength + parent] = temp[idx];
       }
     }
   }
@@ -94,6 +91,7 @@ awkward_reduce_argmax_c(
     int64_t lenparents,
     int64_t outlength,
     T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu
index 262a51b57a..40a8437218 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu
@@ -3,17 +3,17 @@
 // BEGIN PYTHON
 // def f(grid, block, args):
 //     (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args
-//     shared_mem_size = block[0] * toptr.dtype.itemsize
 //     if block[0] > 0:
 //         segment = math.floor((outlength + block[0] - 1) / block[0])
-//         partial_size = outlength * ((lenparents + block[0] - 1) / block[0])
+//         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
 //     else:
 //         segment = 0
-//         partial_size = 0
-//     partial = cupy.full(math.floor(partial_size), -1, dtype=toptr.dtype)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, partial, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, partial, invocation_index, err_code), shared_mem=shared_mem_size)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, invocation_index, err_code))
+//         grid_size = 1
+//     partial = cupy.full(outlength * grid_size, -1, dtype=toptr.dtype)
+//     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
 // out["awkward_reduce_argmin_a", {dtype_specializations}] = None
 // out["awkward_reduce_argmin_b", {dtype_specializations}] = None
 // out["awkward_reduce_argmin_c", {dtype_specializations}] = None
@@ -28,6 +28,7 @@ awkward_reduce_argmin_a(
     int64_t lenparents,
     int64_t outlength,
     T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
@@ -48,30 +49,26 @@ awkward_reduce_argmin_b(
     int64_t lenparents,
     int64_t outlength,
     T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
-    extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
-    T *shared_mem = reinterpret_cast<T *>(shared_memory);
-
     int64_t idx = threadIdx.x;
     int64_t thread_id = blockIdx.x * blockDim.x + idx;
 
     if (thread_id < lenparents) {
-      shared_mem[idx] = thread_id;
-    } else {
-      shared_mem[idx] = -1;
+      temp[idx] = thread_id;
     }
     __syncthreads();
 
     for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
       int64_t index = -1;
       if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        index = shared_mem[idx - stride];
+        index = temp[idx - stride];
       }
-      if (index != -1 && (shared_mem[idx] == -1 || fromptr[index] < fromptr[shared_mem[idx]] ||
-         (fromptr[index] == fromptr[shared_mem[idx]] && index < shared_mem[idx]))) {
-        shared_mem[idx] = index;
+      if (index != -1 && (temp[idx] == -1 || fromptr[index] < fromptr[temp[idx]] ||
+         (fromptr[index] == fromptr[temp[idx]] && index < temp[idx]))) {
+        temp[idx] = index;
       }
       __syncthreads();
     }
@@ -79,7 +76,7 @@ awkward_reduce_argmin_b(
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        partial[blockIdx.x * outlength + parent] = shared_mem[idx];
+        partial[blockIdx.x * outlength + parent] = temp[idx];
       }
     }
   }
@@ -94,6 +91,7 @@ awkward_reduce_argmin_c(
     int64_t lenparents,
     int64_t outlength,
     T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu
index d880c36f0e..cdf870c63c 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu
@@ -3,7 +3,6 @@
 // BEGIN PYTHON
 // def f(grid, block, args):
 //     (toptr, parents, lenparents, outlength, invocation_index, err_code) = args
-//     shared_mem_size = block[0] * toptr.dtype.itemsize
 //     if block[0] > 0:
 //         segment = math.floor((outlength + block[0] - 1) / block[0])
 //         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
@@ -13,7 +12,7 @@
 //     partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype)
 //     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
 //     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_a", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_b", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code), shared_mem=shared_mem_size)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_b", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
 //     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_c", cupy.dtype(toptr.dtype).type, parents.dtype]))((segment,), block, (toptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
 // out["awkward_reduce_count_64_a", {dtype_specializations}] = None
 // out["awkward_reduce_count_64_b", {dtype_specializations}] = None
@@ -56,26 +55,25 @@ awkward_reduce_count_64_b(
     int64_t thread_id = blockIdx.x * blockDim.x + idx;
 
     if (thread_id < lenparents) {
-        temp[thread_id] = 1;
+      temp[idx] = 1;
     }
     __syncthreads();
 
-
     for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
-        int64_t val = 0;
-        if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-            val = temp[thread_id - stride];
-        }
-        __syncthreads();
-        temp[thread_id] += val;
-        __syncthreads();
+      int64_t val = 0;
+      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+        val = temp[idx - stride];
+      }
+      __syncthreads();
+      temp[idx] += val;
+      __syncthreads();
     }
 
     if (thread_id < lenparents) {
-        int64_t parent = parents[thread_id];
-        if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-            partial[blockIdx.x * outlength + parent] = temp[thread_id];
-        }
+      int64_t parent = parents[thread_id];
+      if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+        partial[blockIdx.x * outlength + parent] = temp[idx];
+      }
     }
   }
 }

From 02c03bc70b73df950ebf70efeea1951e4c7569f6 Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Thu, 6 Jun 2024 10:00:05 +0200
Subject: [PATCH 07/33] feat: add awkward_reduce_countnonzero kernel

---
 kernel-test-data.json                         |  2 +-
 .../awkward_reduce_countnonzero.cu            | 72 ++++++++++++++-----
 2 files changed, 54 insertions(+), 20 deletions(-)

diff --git a/kernel-test-data.json b/kernel-test-data.json
index 900f81c525..9616d7eb7b 100644
--- a/kernel-test-data.json
+++ b/kernel-test-data.json
@@ -23698,7 +23698,7 @@
         },
         {
             "name": "awkward_reduce_countnonzero",
-            "status": false,
+            "status": true,
             "tests": [
                 {
                     "error": false,
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu
index 6b07dfa208..11bb84b18f 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu
@@ -3,10 +3,17 @@
 // BEGIN PYTHON
 // def f(grid, block, args):
 //     (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args
-//     atomicAdd_toptr = cupy.array(toptr, dtype=cupy.uint64)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_c", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code))
+//     if block[0] > 0:
+//         segment = math.floor((outlength + block[0] - 1) / block[0])
+//         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
+//     else:
+//         segment = 0
+//         grid_size = 1
+//     partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype)
+//     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
 // out["awkward_reduce_countnonzero_a", {dtype_specializations}] = None
 // out["awkward_reduce_countnonzero_b", {dtype_specializations}] = None
 // out["awkward_reduce_countnonzero_c", {dtype_specializations}] = None
@@ -20,14 +27,15 @@ awkward_reduce_countnonzero_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomicAdd_toptr,
+    T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      atomicAdd_toptr[thread_id] = 0;
+      toptr[thread_id] = 0;
     }
   }
 }
@@ -40,34 +48,60 @@ awkward_reduce_countnonzero_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomicAdd_toptr,
+    T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t idx = threadIdx.x;
+    int64_t thread_id = blockIdx.x * blockDim.x + idx;
 
     if (thread_id < lenparents) {
-      atomicAdd(atomicAdd_toptr + parents[thread_id],
-                (uint64_t)(fromptr[thread_id] != 0));
+      temp[idx] = (fromptr[thread_id] != 0) ? 1 : 0;
+    }
+    __syncthreads();
+
+    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+      int64_t val = 0;
+      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+        val = temp[idx - stride];
+      }
+      __syncthreads();
+      temp[idx] += val;
+      __syncthreads();
+    }
+
+    if (thread_id < lenparents) {
+      int64_t parent = parents[thread_id];
+      if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+        partial[blockIdx.x * outlength + parent] = temp[idx];
+      }
     }
   }
 }
 
 template <typename T, typename C, typename U>
 __global__ void
-awkward_reduce_countnonzero_c(T* toptr,
-                              const C* fromptr,
-                              const U* parents,
-                              int64_t lenparents,
-                              int64_t outlength,
-                              uint64_t* atomicAdd_toptr,
-                              uint64_t invocation_index,
-                              uint64_t* err_code) {
+awkward_reduce_countnonzero_c(
+    T* toptr,
+    const C* fromptr,
+    const U* parents,
+    int64_t lenparents,
+    int64_t outlength,
+    T* partial,
+    T* temp,
+    uint64_t invocation_index,
+    uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      toptr[thread_id] = (T)atomicAdd_toptr[thread_id];
+      int64_t count = 0;
+      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;
+      for (int64_t i = 0; i < blocks; ++i) {
+        count += partial[i * outlength + thread_id];
+      }
+      toptr[thread_id] = count;
     }
   }
 }

From 34fc82b3aa800c028c6e8a91f1be5e30e5f53347 Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Thu, 6 Jun 2024 10:16:13 +0200
Subject: [PATCH 08/33] feat: add reduce sum, min and max kernels

---
 kernel-test-data.json                         |  6 +-
 .../cuda/cuda_kernels/awkward_reduce_max.cu   | 71 +++++++++++++++++--
 .../cuda/cuda_kernels/awkward_reduce_min.cu   | 71 +++++++++++++++++--
 .../cuda/cuda_kernels/awkward_reduce_sum.cu   | 57 +++++++++++----
 4 files changed, 178 insertions(+), 27 deletions(-)

diff --git a/kernel-test-data.json b/kernel-test-data.json
index 9616d7eb7b..21148f61dd 100644
--- a/kernel-test-data.json
+++ b/kernel-test-data.json
@@ -23510,7 +23510,7 @@
         },
         {
             "name": "awkward_reduce_max",
-            "status": false,
+            "status": true,
             "tests": [
                 {
                     "error": false,
@@ -24173,7 +24173,7 @@
         },
         {
             "name": "awkward_reduce_sum",
-            "status": false,
+            "status": true,
             "tests": [
                 {
                     "error": false,
@@ -25342,7 +25342,7 @@
         },
         {
             "name": "awkward_reduce_min",
-            "status": false,
+            "status": true,
             "tests": [
                 {
                     "error": false,
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu
index 3c20b653ac..4ac7df43ba 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu
@@ -3,10 +3,20 @@
 // BEGIN PYTHON
 // def f(grid, block, args):
 //     (toptr, fromptr, parents, lenparents, outlength, identity, invocation_index, err_code) = args
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, identity, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, identity, invocation_index, err_code))
+//     if block[0] > 0:
+//         segment = math.floor((outlength + block[0] - 1) / block[0])
+//         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
+//     else:
+//         segment = 0
+//         grid_size = 1
+//     partial = cupy.full(outlength * grid_size, identity, dtype=toptr.dtype)
+//     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code))
 // out["awkward_reduce_max_a", {dtype_specializations}] = None
 // out["awkward_reduce_max_b", {dtype_specializations}] = None
+// out["awkward_reduce_max_c", {dtype_specializations}] = None
 // END PYTHON
 
 template <typename T, typename C, typename U>
@@ -18,10 +28,13 @@ awkward_reduce_max_a(
     int64_t lenparents,
     int64_t outlength,
     T identity,
+    T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
     if (thread_id < outlength) {
       toptr[thread_id] = identity;
     }
@@ -37,15 +50,61 @@ awkward_reduce_max_b(
     int64_t lenparents,
     int64_t outlength,
     T identity,
+    T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t idx = threadIdx.x;
+    int64_t thread_id = blockIdx.x * blockDim.x + idx;
+
+    if (thread_id < lenparents) {
+      temp[idx] = fromptr[thread_id];
+    }
+    __syncthreads();
+
+    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+      T val = identity;
+      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+        val = temp[idx - stride];
+      }
+      __syncthreads();
+      temp[idx] = val > temp[idx] ? val : temp[idx];
+      __syncthreads();
+    }
 
     if (thread_id < lenparents) {
-      C x = fromptr[thread_id];
-      toptr[parents[thread_id]] =
-          (x > toptr[parents[thread_id]] ? x : toptr[parents[thread_id]]);
+      int64_t parent = parents[thread_id];
+      if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+        partial[blockIdx.x * outlength + parent] = temp[idx];
+      }
+    }
+  }
+}
+
+template <typename T, typename C, typename U>
+__global__ void
+awkward_reduce_max_c(
+    T* toptr,
+    const C* fromptr,
+    const U* parents,
+    int64_t lenparents,
+    int64_t outlength,
+    T identity,
+    T* partial,
+    T* temp,
+    uint64_t invocation_index,
+    uint64_t* err_code) {
+  if (err_code[0] == NO_ERROR) {
+    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (thread_id < outlength) {
+      T maximum = identity;
+      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;
+      for (int64_t i = 0; i < blocks; ++i) {
+        maximum = maximum > partial[i * outlength + thread_id] ? maximum : partial[i * outlength + thread_id];
+      }
+      toptr[thread_id] = maximum;
     }
   }
 }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu
index ae0e2dcb61..f524485e58 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu
@@ -3,10 +3,20 @@
 // BEGIN PYTHON
 // def f(grid, block, args):
 //     (toptr, fromptr, parents, lenparents, outlength, identity, invocation_index, err_code) = args
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, identity, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, identity, invocation_index, err_code))
+//     if block[0] > 0:
+//         segment = math.floor((outlength + block[0] - 1) / block[0])
+//         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
+//     else:
+//         segment = 0
+//         grid_size = 1
+//     partial = cupy.full(outlength * grid_size, identity, dtype=toptr.dtype)
+//     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code))
 // out["awkward_reduce_min_a", {dtype_specializations}] = None
 // out["awkward_reduce_min_b", {dtype_specializations}] = None
+// out["awkward_reduce_min_c", {dtype_specializations}] = None
 // END PYTHON
 
 template <typename T, typename C, typename U>
@@ -18,10 +28,13 @@ awkward_reduce_min_a(
     int64_t lenparents,
     int64_t outlength,
     T identity,
+    T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
     if (thread_id < outlength) {
       toptr[thread_id] = identity;
     }
@@ -37,15 +50,61 @@ awkward_reduce_min_b(
     int64_t lenparents,
     int64_t outlength,
     T identity,
+    T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t idx = threadIdx.x;
+    int64_t thread_id = blockIdx.x * blockDim.x + idx;
+
+    if (thread_id < lenparents) {
+      temp[idx] = fromptr[thread_id];
+    }
+    __syncthreads();
+
+    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+      T val = identity;
+      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+        val = temp[idx - stride];
+      }
+      __syncthreads();
+      temp[idx] = val < temp[idx] ? val : temp[idx];
+      __syncthreads();
+    }
 
     if (thread_id < lenparents) {
-      C x = fromptr[thread_id];
-      toptr[parents[thread_id]] =
-          (x < toptr[parents[thread_id]] ? x : toptr[parents[thread_id]]);
+      int64_t parent = parents[thread_id];
+      if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+        partial[blockIdx.x * outlength + parent] = temp[idx];
+      }
+    }
+  }
+}
+
+template <typename T, typename C, typename U>
+__global__ void
+awkward_reduce_min_c(
+    T* toptr,
+    const C* fromptr,
+    const U* parents,
+    int64_t lenparents,
+    int64_t outlength,
+    T identity,
+    T* partial,
+    T* temp,
+    uint64_t invocation_index,
+    uint64_t* err_code) {
+  if (err_code[0] == NO_ERROR) {
+    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (thread_id < outlength) {
+      T minimum = identity;
+      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;
+      for (int64_t i = 0; i < blocks; ++i) {
+        minimum = minimum < partial[i * outlength + thread_id] ? minimum : partial[i * outlength + thread_id];
+      }
+      toptr[thread_id] = minimum;
     }
   }
 }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu
index 13c5a31dbf..e641d728b4 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu
@@ -3,10 +3,17 @@
 // BEGIN PYTHON
 // def f(grid, block, args):
 //     (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args
-//     atomicAdd_toptr = cupy.array(toptr, dtype=cupy.uint64)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_c", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code))
+//     if block[0] > 0:
+//         segment = math.floor((outlength + block[0] - 1) / block[0])
+//         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
+//     else:
+//         segment = 0
+//         grid_size = 1
+//     partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype)
+//     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
 // out["awkward_reduce_sum_a", {dtype_specializations}] = None
 // out["awkward_reduce_sum_b", {dtype_specializations}] = None
 // out["awkward_reduce_sum_c", {dtype_specializations}] = None
@@ -20,14 +27,15 @@ awkward_reduce_sum_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomicAdd_toptr,
+    T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      atomicAdd_toptr[thread_id] = 0;
+      toptr[thread_id] = 0;
     }
   }
 }
@@ -40,15 +48,34 @@ awkward_reduce_sum_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomicAdd_toptr,
+    T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t idx = threadIdx.x;
+    int64_t thread_id = blockIdx.x * blockDim.x + idx;
+
+    if (thread_id < lenparents) {
+      temp[idx] = fromptr[thread_id];
+    }
+    __syncthreads();
+
+    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+      T val = 0;
+      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+        val = temp[idx - stride];
+      }
+      __syncthreads();
+      temp[idx] += val;
+      __syncthreads();
+    }
 
     if (thread_id < lenparents) {
-      atomicAdd(atomicAdd_toptr + parents[thread_id],
-                (uint64_t)fromptr[thread_id]);
+      int64_t parent = parents[thread_id];
+      if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+        partial[blockIdx.x * outlength + parent] = temp[idx];
+      }
     }
   }
 }
@@ -61,14 +88,20 @@ awkward_reduce_sum_c(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomicAdd_toptr,
+    T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      toptr[thread_id] = (T)atomicAdd_toptr[thread_id];
+      T sum = 0;
+      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;
+      for (int64_t i = 0; i < blocks; ++i) {
+        sum += partial[i * outlength + thread_id];
+      }
+      toptr[thread_id] = sum;
     }
   }
 }

From 4e00f0723c9c98d32ee2d47c0fbe92b9ac6cdf42 Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Thu, 6 Jun 2024 10:37:24 +0200
Subject: [PATCH 09/33] feat: add reduce prod and sum_int_bool

---
 dev/generate-kernel-signatures.py             |   1 +
 dev/generate-tests.py                         |   1 +
 kernel-test-data.json                         | 794 +++++++++++++++++-
 src/awkward/_connect/cuda/__init__.py         |   1 +
 .../cuda/cuda_kernels/awkward_reduce_prod.cu  | 107 +++
 .../awkward_reduce_sum_int32_bool_64.cu       |  72 +-
 .../awkward_reduce_sum_int64_bool_64.cu       |  72 +-
 7 files changed, 1009 insertions(+), 39 deletions(-)
 create mode 100644 src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu

diff --git a/dev/generate-kernel-signatures.py b/dev/generate-kernel-signatures.py
index c70f787c25..4f02a5ca42 100644
--- a/dev/generate-kernel-signatures.py
+++ b/dev/generate-kernel-signatures.py
@@ -117,6 +117,7 @@
     "awkward_reduce_sum_int32_bool_64",
     "awkward_reduce_sum_int64_bool_64",
     "awkward_reduce_sum_bool",
+    "awkward_reduce_prod",
     "awkward_reduce_prod_bool",
     "awkward_reduce_countnonzero",
     "awkward_sorting_ranges",
diff --git a/dev/generate-tests.py b/dev/generate-tests.py
index db89f2655e..7267d5659c 100644
--- a/dev/generate-tests.py
+++ b/dev/generate-tests.py
@@ -902,6 +902,7 @@ def gencpuunittests(specdict):
     "awkward_reduce_sum_int32_bool_64",
     "awkward_reduce_sum_int64_bool_64",
     "awkward_reduce_sum_bool",
+    "awkward_reduce_prod",
     "awkward_reduce_prod_bool",
     "awkward_reduce_countnonzero",
     "awkward_sorting_ranges",
diff --git a/kernel-test-data.json b/kernel-test-data.json
index 21148f61dd..0df339f461 100644
--- a/kernel-test-data.json
+++ b/kernel-test-data.json
@@ -24567,9 +24567,801 @@
                 }
             ]
         },
+        {
+            "name": "awkward_reduce_sum_int32_bool_64",
+            "status": true,
+            "tests": [
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [],
+                        "lenparents": 0,
+                        "outlength": 0,
+                        "parents": []
+                    },
+                    "outputs": {
+                        "toptr": []
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0],
+                        "lenparents": 1,
+                        "outlength": 1,
+                        "parents": [0]
+                    },
+                    "outputs": {
+                        "toptr": [0]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 5, 20, 1, 6, 21, 2, 7, 22, 3, 8, 23, 4, 9, 24],
+                        "lenparents": 15,
+                        "outlength": 10,
+                        "parents": [0, 5, 5, 1, 6, 6, 2, 7, 7, 3, 8, 8, 4, 9, 9]
+                    },
+                    "outputs": {
+                        "toptr": [0, 1, 1, 1, 1, 2, 2, 2, 2, 2]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [2, 3, 5, 7, 11, 13, 17, 19, 23],
+                        "lenparents": 9,
+                        "outlength": 6,
+                        "parents": [0, 0, 0, 2, 2, 3, 4, 4, 5]
+                    },
+                    "outputs": {
+                        "toptr": [3, 0, 2, 1, 2, 1]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [1, 0, 0, 1, 0, 0],
+                        "lenparents": 6,
+                        "outlength": 4,
+                        "parents": [0, 0, 0, 2, 2, 3]
+                    },
+                    "outputs": {
+                        "toptr": [1, 0, 1, 0]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24],
+                        "lenparents": 15,
+                        "outlength": 3,
+                        "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
+                    },
+                    "outputs": {
+                        "toptr": [4, 5, 5]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
+                        "lenparents": 30,
+                        "outlength": 6,
+                        "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5]
+                    },
+                    "outputs": {
+                        "toptr": [4, 5, 5, 5, 5, 5]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 1, 3, 4, 5, 6],
+                        "lenparents": 6,
+                        "outlength": 4,
+                        "parents": [0, 0, 1, 3, 3, 3]
+                    },
+                    "outputs": {
+                        "toptr": [1, 1, 0, 3]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 5, 10, 15, 25, 1, 11, 16, 26, 2, 12, 17, 27, 8, 18, 28, 4, 9, 14, 29],
+                        "lenparents": 20,
+                        "outlength": 10,
+                        "parents": [0, 0, 0, 5, 5, 1, 1, 6, 6, 2, 2, 7, 7, 3, 8, 8, 4, 4, 4, 9]
+                    },
+                    "outputs": {
+                        "toptr": [2, 2, 2, 1, 3, 2, 2, 2, 2, 1]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [15, 20, 25, 16, 21, 26, 17, 22, 27, 18, 23, 28, 19, 24, 29],
+                        "lenparents": 15,
+                        "outlength": 15,
+                        "parents": [0, 5, 10, 1, 6, 11, 2, 7, 12, 3, 8, 13, 4, 9, 14]
+                    },
+                    "outputs": {
+                        "toptr": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 15, 5, 10, 25, 1, 16, 11, 26, 2, 17, 12, 27, 18, 8, 28, 4, 9, 14, 29],
+                        "lenparents": 20,
+                        "outlength": 15,
+                        "parents": [0, 0, 5, 10, 10, 1, 1, 11, 11, 2, 2, 12, 12, 3, 8, 13, 4, 9, 14, 14]
+                    },
+                    "outputs": {
+                        "toptr": [1, 2, 2, 1, 1, 1, 0, 0, 1, 1, 2, 2, 2, 1, 2]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 15, 5, 20, 10, 25, 1, 16, 6, 21, 11, 26, 2, 17, 7, 22, 12, 27, 3, 18, 8, 23, 13, 28, 4, 19, 9, 24, 14, 29],
+                        "lenparents": 30,
+                        "outlength": 15,
+                        "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 3, 8, 8, 13, 13, 4, 4, 9, 9, 14, 14]
+                    },
+                    "outputs": {
+                        "toptr": [1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 5, 10, 15, 20, 25, 1, 6, 11, 16, 21, 26, 2, 7, 12, 17, 22, 27, 3, 8, 13, 18, 23, 28, 4, 9, 14, 19, 24, 29],
+                        "lenparents": 30,
+                        "outlength": 10,
+                        "parents": [0, 0, 0, 5, 5, 5, 1, 1, 1, 6, 6, 6, 2, 2, 2, 7, 7, 7, 3, 3, 3, 8, 8, 8, 4, 4, 4, 9, 9, 9]
+                    },
+                    "outputs": {
+                        "toptr": [2, 3, 3, 3, 3, 3, 3, 3, 3, 3]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0],
+                        "lenparents": 12,
+                        "outlength": 3,
+                        "parents": [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
+                    },
+                    "outputs": {
+                        "toptr": [4, 4, 0]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [1, 2, 3, 4, 5, 1, 2, 3, 4, 5],
+                        "lenparents": 10,
+                        "outlength": 2,
+                        "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+                    },
+                    "outputs": {
+                        "toptr": [5, 5]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [1, 2, 3, 4, 5, 6],
+                        "lenparents": 6,
+                        "outlength": 1,
+                        "parents": [0, 0, 0, 0, 0, 0]
+                    },
+                    "outputs": {
+                        "toptr": [6]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [2, 7, 13, 17, 23, 3, 11, 19, 5],
+                        "lenparents": 9,
+                        "outlength": 8,
+                        "parents": [0, 0, 0, 6, 6, 1, 1, 7, 2]
+                    },
+                    "outputs": {
+                        "toptr": [3, 2, 1, 0, 0, 0, 2, 1]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [1, 16, 0, 2, 32, 0, 4, 64, 0, 8, 128, 0],
+                        "lenparents": 12,
+                        "outlength": 4,
+                        "parents": [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
+                    },
+                    "outputs": {
+                        "toptr": [2, 2, 2, 2]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 1, 2, 3, 4, 5],
+                        "lenparents": 6,
+                        "outlength": 4,
+                        "parents": [0, 0, 0, 2, 2, 3]
+                    },
+                    "outputs": {
+                        "toptr": [2, 0, 2, 1]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 4, 1, 3, 5, 6],
+                        "lenparents": 6,
+                        "outlength": 4,
+                        "parents": [0, 0, 1, 1, 1, 3]
+                    },
+                    "outputs": {
+                        "toptr": [1, 3, 0, 1]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [1, 4, 9, 16, 25, 1, 4, 9, 16, 25],
+                        "lenparents": 10,
+                        "outlength": 2,
+                        "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+                    },
+                    "outputs": {
+                        "toptr": [5, 5]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [1, 4, 9, 16, 26, 1, 4, 10, 16, 24],
+                        "lenparents": 10,
+                        "outlength": 2,
+                        "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+                    },
+                    "outputs": {
+                        "toptr": [5, 5]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 5, 20, 1, 6, 21, 2, 7, 22, 3, 8, 23, 4, 9, 24],
+                        "lenparents": 15,
+                        "outlength": 10,
+                        "parents": [0, 0, 5, 1, 1, 6, 2, 2, 7, 3, 3, 8, 4, 4, 9]
+                    },
+                    "outputs": {
+                        "toptr": [1, 2, 2, 2, 2, 1, 1, 1, 1, 1]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [15, 20, 25, 16, 21, 26, 17, 22, 27, 18, 23, 28, 19, 24, 29],
+                        "lenparents": 15,
+                        "outlength": 5,
+                        "parents": [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4]
+                    },
+                    "outputs": {
+                        "toptr": [3, 3, 3, 3, 3]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [1, 2, 3],
+                        "lenparents": 3,
+                        "outlength": 1,
+                        "parents": [0, 0, 0]
+                    },
+                    "outputs": {
+                        "toptr": [3]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 1, 2, 4, 5, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 25, 26, 27, 28, 29],
+                        "lenparents": 20,
+                        "outlength": 6,
+                        "parents": [0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 5, 5, 5, 5, 5]
+                    },
+                    "outputs": {
+                        "toptr": [3, 3, 4, 4, 0, 5]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [2, 2, 4, 5, 5],
+                        "lenparents": 5,
+                        "outlength": 3,
+                        "parents": [0, 0, 0, 2, 2]
+                    },
+                    "outputs": {
+                        "toptr": [3, 0, 2]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
+                        "lenparents": 15,
+                        "outlength": 3,
+                        "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
+                    },
+                    "outputs": {
+                        "toptr": [5, 5, 5]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [4, 1, 0, 1, 4, 5, 1, 0, 1, 3],
+                        "lenparents": 10,
+                        "outlength": 2,
+                        "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+                    },
+                    "outputs": {
+                        "toptr": [4, 4]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [4, 1, 0, 1, 4, 4, 1, 0, 1, 4],
+                        "lenparents": 10,
+                        "outlength": 2,
+                        "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+                    },
+                    "outputs": {
+                        "toptr": [4, 4]
+                    }
+                }
+            ]
+        },
+        {
+            "name": "awkward_reduce_sum_int64_bool_64",
+            "status": true,
+            "tests": [
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [],
+                        "lenparents": 0,
+                        "outlength": 0,
+                        "parents": []
+                    },
+                    "outputs": {
+                        "toptr": []
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0],
+                        "lenparents": 1,
+                        "outlength": 1,
+                        "parents": [0]
+                    },
+                    "outputs": {
+                        "toptr": [0]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 5, 20, 1, 6, 21, 2, 7, 22, 3, 8, 23, 4, 9, 24],
+                        "lenparents": 15,
+                        "outlength": 10,
+                        "parents": [0, 5, 5, 1, 6, 6, 2, 7, 7, 3, 8, 8, 4, 9, 9]
+                    },
+                    "outputs": {
+                        "toptr": [0, 1, 1, 1, 1, 2, 2, 2, 2, 2]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [2, 3, 5, 7, 11, 13, 17, 19, 23],
+                        "lenparents": 9,
+                        "outlength": 6,
+                        "parents": [0, 0, 0, 2, 2, 3, 4, 4, 5]
+                    },
+                    "outputs": {
+                        "toptr": [3, 0, 2, 1, 2, 1]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [1, 0, 0, 1, 0, 0],
+                        "lenparents": 6,
+                        "outlength": 4,
+                        "parents": [0, 0, 0, 2, 2, 3]
+                    },
+                    "outputs": {
+                        "toptr": [1, 0, 1, 0]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24],
+                        "lenparents": 15,
+                        "outlength": 3,
+                        "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
+                    },
+                    "outputs": {
+                        "toptr": [4, 5, 5]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
+                        "lenparents": 30,
+                        "outlength": 6,
+                        "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5]
+                    },
+                    "outputs": {
+                        "toptr": [4, 5, 5, 5, 5, 5]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 1, 3, 4, 5, 6],
+                        "lenparents": 6,
+                        "outlength": 4,
+                        "parents": [0, 0, 1, 3, 3, 3]
+                    },
+                    "outputs": {
+                        "toptr": [1, 1, 0, 3]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 5, 10, 15, 25, 1, 11, 16, 26, 2, 12, 17, 27, 8, 18, 28, 4, 9, 14, 29],
+                        "lenparents": 20,
+                        "outlength": 10,
+                        "parents": [0, 0, 0, 5, 5, 1, 1, 6, 6, 2, 2, 7, 7, 3, 8, 8, 4, 4, 4, 9]
+                    },
+                    "outputs": {
+                        "toptr": [2, 2, 2, 1, 3, 2, 2, 2, 2, 1]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [15, 20, 25, 16, 21, 26, 17, 22, 27, 18, 23, 28, 19, 24, 29],
+                        "lenparents": 15,
+                        "outlength": 15,
+                        "parents": [0, 5, 10, 1, 6, 11, 2, 7, 12, 3, 8, 13, 4, 9, 14]
+                    },
+                    "outputs": {
+                        "toptr": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 15, 5, 10, 25, 1, 16, 11, 26, 2, 17, 12, 27, 18, 8, 28, 4, 9, 14, 29],
+                        "lenparents": 20,
+                        "outlength": 15,
+                        "parents": [0, 0, 5, 10, 10, 1, 1, 11, 11, 2, 2, 12, 12, 3, 8, 13, 4, 9, 14, 14]
+                    },
+                    "outputs": {
+                        "toptr": [1, 2, 2, 1, 1, 1, 0, 0, 1, 1, 2, 2, 2, 1, 2]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 15, 5, 20, 10, 25, 1, 16, 6, 21, 11, 26, 2, 17, 7, 22, 12, 27, 3, 18, 8, 23, 13, 28, 4, 19, 9, 24, 14, 29],
+                        "lenparents": 30,
+                        "outlength": 15,
+                        "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 3, 8, 8, 13, 13, 4, 4, 9, 9, 14, 14]
+                    },
+                    "outputs": {
+                        "toptr": [1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 5, 10, 15, 20, 25, 1, 6, 11, 16, 21, 26, 2, 7, 12, 17, 22, 27, 3, 8, 13, 18, 23, 28, 4, 9, 14, 19, 24, 29],
+                        "lenparents": 30,
+                        "outlength": 10,
+                        "parents": [0, 0, 0, 5, 5, 5, 1, 1, 1, 6, 6, 6, 2, 2, 2, 7, 7, 7, 3, 3, 3, 8, 8, 8, 4, 4, 4, 9, 9, 9]
+                    },
+                    "outputs": {
+                        "toptr": [2, 3, 3, 3, 3, 3, 3, 3, 3, 3]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0],
+                        "lenparents": 12,
+                        "outlength": 3,
+                        "parents": [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
+                    },
+                    "outputs": {
+                        "toptr": [4, 4, 0]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [1, 2, 3, 4, 5, 1, 2, 3, 4, 5],
+                        "lenparents": 10,
+                        "outlength": 2,
+                        "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+                    },
+                    "outputs": {
+                        "toptr": [5, 5]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [1, 2, 3, 4, 5, 6],
+                        "lenparents": 6,
+                        "outlength": 1,
+                        "parents": [0, 0, 0, 0, 0, 0]
+                    },
+                    "outputs": {
+                        "toptr": [6]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [2, 7, 13, 17, 23, 3, 11, 19, 5],
+                        "lenparents": 9,
+                        "outlength": 8,
+                        "parents": [0, 0, 0, 6, 6, 1, 1, 7, 2]
+                    },
+                    "outputs": {
+                        "toptr": [3, 2, 1, 0, 0, 0, 2, 1]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [1, 16, 0, 2, 32, 0, 4, 64, 0, 8, 128, 0],
+                        "lenparents": 12,
+                        "outlength": 4,
+                        "parents": [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
+                    },
+                    "outputs": {
+                        "toptr": [2, 2, 2, 2]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 1, 2, 3, 4, 5],
+                        "lenparents": 6,
+                        "outlength": 4,
+                        "parents": [0, 0, 0, 2, 2, 3]
+                    },
+                    "outputs": {
+                        "toptr": [2, 0, 2, 1]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 4, 1, 3, 5, 6],
+                        "lenparents": 6,
+                        "outlength": 4,
+                        "parents": [0, 0, 1, 1, 1, 3]
+                    },
+                    "outputs": {
+                        "toptr": [1, 3, 0, 1]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [1, 4, 9, 16, 25, 1, 4, 9, 16, 25],
+                        "lenparents": 10,
+                        "outlength": 2,
+                        "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+                    },
+                    "outputs": {
+                        "toptr": [5, 5]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [1, 4, 9, 16, 26, 1, 4, 10, 16, 24],
+                        "lenparents": 10,
+                        "outlength": 2,
+                        "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+                    },
+                    "outputs": {
+                        "toptr": [5, 5]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 5, 20, 1, 6, 21, 2, 7, 22, 3, 8, 23, 4, 9, 24],
+                        "lenparents": 15,
+                        "outlength": 10,
+                        "parents": [0, 0, 5, 1, 1, 6, 2, 2, 7, 3, 3, 8, 4, 4, 9]
+                    },
+                    "outputs": {
+                        "toptr": [1, 2, 2, 2, 2, 1, 1, 1, 1, 1]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [15, 20, 25, 16, 21, 26, 17, 22, 27, 18, 23, 28, 19, 24, 29],
+                        "lenparents": 15,
+                        "outlength": 5,
+                        "parents": [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4]
+                    },
+                    "outputs": {
+                        "toptr": [3, 3, 3, 3, 3]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [1, 2, 3],
+                        "lenparents": 3,
+                        "outlength": 1,
+                        "parents": [0, 0, 0]
+                    },
+                    "outputs": {
+                        "toptr": [3]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 1, 2, 4, 5, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 25, 26, 27, 28, 29],
+                        "lenparents": 20,
+                        "outlength": 6,
+                        "parents": [0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 5, 5, 5, 5, 5]
+                    },
+                    "outputs": {
+                        "toptr": [3, 3, 4, 4, 0, 5]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [2, 2, 4, 5, 5],
+                        "lenparents": 5,
+                        "outlength": 3,
+                        "parents": [0, 0, 0, 2, 2]
+                    },
+                    "outputs": {
+                        "toptr": [3, 0, 2]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
+                        "lenparents": 15,
+                        "outlength": 3,
+                        "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
+                    },
+                    "outputs": {
+                        "toptr": [5, 5, 5]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [4, 1, 0, 1, 4, 5, 1, 0, 1, 3],
+                        "lenparents": 10,
+                        "outlength": 2,
+                        "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+                    },
+                    "outputs": {
+                        "toptr": [4, 4]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [4, 1, 0, 1, 4, 4, 1, 0, 1, 4],
+                        "lenparents": 10,
+                        "outlength": 2,
+                        "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+                    },
+                    "outputs": {
+                        "toptr": [4, 4]
+                    }
+                }
+            ]
+        },
         {
             "name": "awkward_reduce_prod",
-            "status": false,
+            "status": true,
             "tests": [
                 {
                     "error": false,
diff --git a/src/awkward/_connect/cuda/__init__.py b/src/awkward/_connect/cuda/__init__.py
index 5d7a77b8c5..447002c1c4 100644
--- a/src/awkward/_connect/cuda/__init__.py
+++ b/src/awkward/_connect/cuda/__init__.py
@@ -118,6 +118,7 @@ def fetch_template_specializations(kernel_dict):
         "awkward_reduce_sum_int32_bool_64",
         "awkward_reduce_sum_int64_bool_64",
         "awkward_reduce_sum_bool",
+        "awkward_reduce_prod",
         "awkward_reduce_prod_bool",
         "awkward_reduce_argmax",
         "awkward_reduce_argmin",
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu
new file mode 100644
index 0000000000..1f7e6d4ff0
--- /dev/null
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu
@@ -0,0 +1,140 @@
+// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE
+
+// BEGIN PYTHON
+// def f(grid, block, args):
+//     (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args
+//     if block[0] > 0:
+//         segment = math.floor((outlength + block[0] - 1) / block[0])
+//         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
+//     else:
+//         segment = 0
+//         grid_size = 1
+//     partial = cupy.ones(outlength * grid_size, dtype=toptr.dtype)
+//     temp = cupy.ones(lenparents, dtype=toptr.dtype)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+// out["awkward_reduce_prod_a", {dtype_specializations}] = None
+// out["awkward_reduce_prod_b", {dtype_specializations}] = None
+// out["awkward_reduce_prod_c", {dtype_specializations}] = None
+// END PYTHON
+
+// Step 1 of 3: write the multiplicative identity into every output slot.
+// Uses one thread per output element; threads at or beyond `outlength` do
+// nothing. Only `toptr`, `outlength` and `err_code` are read here; the other
+// parameters exist because the launcher passes one argument tuple to all
+// three kernels.
+template <typename T, typename C, typename U>
+__global__ void
+awkward_reduce_prod_a(
+    T* toptr,
+    const C* fromptr,
+    const U* parents,
+    int64_t lenparents,
+    int64_t outlength,
+    T* partial,
+    T* temp,
+    uint64_t invocation_index,
+    uint64_t* err_code) {
+  if (err_code[0] == NO_ERROR) {
+    int64_t thread_id = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+
+    if (thread_id < outlength) {
+      toptr[thread_id] = 1;
+    }
+  }
+}
+
+// Step 2 of 3: segmented product inside each block.
+//
+// Every in-range thread copies one element of `fromptr` into `temp`. The
+// block then runs a doubling-stride scan: at each step a thread multiplies in
+// the value `stride` positions to its left, but only when that position is in
+// the same block and has the same parent. Afterwards the last thread of each
+// run of equal parents within the block stores its value in
+// `partial[blockIdx.x * outlength + parent]`.
+//
+// `temp` is a global buffer of `lenparents` elements shared by all blocks, so
+// it is indexed by the global `thread_id` and only by in-range threads. That
+// gives each block its own slice and keeps accesses inside the buffer even
+// when `lenparents` is smaller than the block size.
+//
+// NOTE(review): the scan compares only the two endpoints of each stride, so it
+// presumably relies on equal parents being contiguous -- confirm with callers.
+template <typename T, typename C, typename U>
+__global__ void
+awkward_reduce_prod_b(
+    T* toptr,
+    const C* fromptr,
+    const U* parents,
+    int64_t lenparents,
+    int64_t outlength,
+    T* partial,
+    T* temp,
+    uint64_t invocation_index,
+    uint64_t* err_code) {
+  if (err_code[0] == NO_ERROR) {
+    int64_t idx = threadIdx.x;
+    int64_t thread_id = static_cast<int64_t>(blockIdx.x) * blockDim.x + idx;
+    const bool in_range = thread_id < lenparents;
+
+    if (in_range) {
+      temp[thread_id] = fromptr[thread_id];
+    }
+    __syncthreads();
+
+    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+      T val = 1;
+      // idx >= stride keeps the left neighbour inside this block's slice.
+      if (in_range && idx >= stride && parents[thread_id] == parents[thread_id - stride]) {
+        val = temp[thread_id - stride];
+      }
+      // Both barriers sit outside the range check so every thread reaches them.
+      __syncthreads();
+      if (in_range) {
+        temp[thread_id] *= val;
+      }
+      __syncthreads();
+    }
+
+    if (in_range) {
+      int64_t parent = parents[thread_id];
+      // True for the last element of a run of equal parents in this block.
+      if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+        partial[blockIdx.x * outlength + parent] = temp[thread_id];
+      }
+    }
+  }
+}
+
+// Step 3 of 3: combine the per-block results.
+// One thread per output element multiplies the `partial` entry of every block
+// for that output and writes the product to `toptr`. Entries that kernel b
+// never stored keep the value 1 that the launcher filled `partial` with. The
+// block count is derived from `blockDim.x`, so this kernel must run with the
+// same block size as awkward_reduce_prod_b.
+template <typename T, typename C, typename U>
+__global__ void
+awkward_reduce_prod_c(
+    T* toptr,
+    const C* fromptr,
+    const U* parents,
+    int64_t lenparents,
+    int64_t outlength,
+    T* partial,
+    T* temp,
+    uint64_t invocation_index,
+    uint64_t* err_code) {
+  if (err_code[0] == NO_ERROR) {
+    int64_t thread_id = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+
+    if (thread_id < outlength) {
+      T prod = 1;
+      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;
+      for (int64_t i = 0; i < blocks; ++i) {
+        prod *= partial[i * outlength + thread_id];
+      }
+      toptr[thread_id] = prod;
+    }
+  }
+}
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu
index 8bdb3fccc2..52cc05492d 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu
@@ -3,10 +3,17 @@
 // BEGIN PYTHON
 // def f(grid, block, args):
 //     (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args
-//     atomicAdd_toptr = cupy.array(toptr, dtype=cupy.uint64)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_c", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code))
+//     if block[0] > 0:
+//         segment = math.floor((outlength + block[0] - 1) / block[0])
+//         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
+//     else:
+//         segment = 0
+//         grid_size = 1
+//     partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype)
+//     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_a", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_b", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_c", int32, bool_, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
 // out["awkward_reduce_sum_int32_bool_64_a", {dtype_specializations}] = None
 // out["awkward_reduce_sum_int32_bool_64_b", {dtype_specializations}] = None
 // out["awkward_reduce_sum_int32_bool_64_c", {dtype_specializations}] = None
@@ -20,14 +27,15 @@ awkward_reduce_sum_int32_bool_64_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomicAdd_toptr,
+    T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      atomicAdd_toptr[thread_id] = 0;
+      toptr[thread_id] = 0;
     }
   }
 }
@@ -40,34 +48,60 @@ awkward_reduce_sum_int32_bool_64_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomicAdd_toptr,
+    T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t idx = threadIdx.x;
+    int64_t thread_id = blockIdx.x * blockDim.x + idx;
 
     if (thread_id < lenparents) {
-      atomicAdd(atomicAdd_toptr + parents[thread_id],
-                (uint64_t)(fromptr[thread_id] != 0));
+      temp[idx] = (fromptr[thread_id] != 0) ? 1 : 0;;
+    }
+    __syncthreads();
+
+    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+      T val = 0;
+      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+        val = temp[idx - stride];
+      }
+      __syncthreads();
+      temp[idx] += val;
+      __syncthreads();
+    }
+
+    if (thread_id < lenparents) {
+      int64_t parent = parents[thread_id];
+      if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+        partial[blockIdx.x * outlength + parent] = temp[idx];
+      }
     }
   }
 }
 
 template <typename T, typename C, typename U>
 __global__ void
-awkward_reduce_sum_int32_bool_64_c(T* toptr,
-                                   const C* fromptr,
-                                   const U* parents,
-                                   int64_t lenparents,
-                                   int64_t outlength,
-                                   uint64_t* atomicAdd_toptr,
-                                   uint64_t invocation_index,
-                                   uint64_t* err_code) {
+awkward_reduce_sum_int32_bool_64_c(  // final pass: one thread per output adds up the per-block partial sums
+    T* toptr,  // output, one element per thread_id < outlength
+    const C* fromptr,
+    const U* parents,
+    int64_t lenparents,
+    int64_t outlength,
+    T* partial,  // per-block results, read as partial[block * outlength + output]
+    T* temp,  // not used by this kernel
+    uint64_t invocation_index,
+    uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      toptr[thread_id] = (T)atomicAdd_toptr[thread_id];
+      T sum = 0;  // running total for output element thread_id
+      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;  // ceil(lenparents / blockDim.x); assumes the same block size as the _b launch
+      for (int64_t i = 0; i < blocks; ++i) {
+        sum += partial[i * outlength + thread_id];  // add block i's contribution
+      }
+      toptr[thread_id] = sum;
     }
   }
 }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu
index 041558a663..a215bb92f3 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu
@@ -3,10 +3,17 @@
 // BEGIN PYTHON
 // def f(grid, block, args):
 //     (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args
-//     atomicAdd_toptr = cupy.array(toptr, dtype=cupy.uint64)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_c", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code))
+//     if block[0] > 0:
+//         segment = math.floor((outlength + block[0] - 1) / block[0])
+//         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
+//     else:
+//         segment = 0
+//         grid_size = 1
+//     partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype)
+//     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_a", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_b", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_c", int64, bool_, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
 // out["awkward_reduce_sum_int64_bool_64_a", {dtype_specializations}] = None
 // out["awkward_reduce_sum_int64_bool_64_b", {dtype_specializations}] = None
 // out["awkward_reduce_sum_int64_bool_64_c", {dtype_specializations}] = None
@@ -20,14 +27,15 @@ awkward_reduce_sum_int64_bool_64_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomicAdd_toptr,
+    T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      atomicAdd_toptr[thread_id] = 0;
+      toptr[thread_id] = 0;
     }
   }
 }
@@ -40,34 +48,60 @@ awkward_reduce_sum_int64_bool_64_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomicAdd_toptr,
+    T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t idx = threadIdx.x;
+    int64_t thread_id = blockIdx.x * blockDim.x + idx;
 
     if (thread_id < lenparents) {
-      atomicAdd(atomicAdd_toptr + parents[thread_id],
-                (uint64_t)(fromptr[thread_id] != 0));
+      temp[idx] = (fromptr[thread_id] != 0) ? 1 : 0;;
+    }
+    __syncthreads();
+
+    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+      T val = 0;
+      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+        val = temp[idx - stride];
+      }
+      __syncthreads();
+      temp[idx] += val;
+      __syncthreads();
+    }
+
+    if (thread_id < lenparents) {
+      int64_t parent = parents[thread_id];
+      if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+        partial[blockIdx.x * outlength + parent] = temp[idx];
+      }
     }
   }
 }
 
 template <typename T, typename C, typename U>
 __global__ void
-awkward_reduce_sum_int64_bool_64_c(T* toptr,
-                                   const C* fromptr,
-                                   const U* parents,
-                                   int64_t lenparents,
-                                   int64_t outlength,
-                                   uint64_t* atomicAdd_toptr,
-                                   uint64_t invocation_index,
-                                   uint64_t* err_code) {
+awkward_reduce_sum_int64_bool_64_c(  // final pass: one thread per output adds up the per-block partial sums
+    T* toptr,  // output, one element per thread_id < outlength
+    const C* fromptr,
+    const U* parents,
+    int64_t lenparents,
+    int64_t outlength,
+    T* partial,  // per-block results, read as partial[block * outlength + output]
+    T* temp,  // not used by this kernel
+    uint64_t invocation_index,
+    uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      toptr[thread_id] = (T)atomicAdd_toptr[thread_id];
+      T sum = 0;  // running total for output element thread_id
+      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;  // ceil(lenparents / blockDim.x); assumes the same block size as the _b launch
+      for (int64_t i = 0; i < blocks; ++i) {
+        sum += partial[i * outlength + thread_id];  // add block i's contribution
+      }
+      toptr[thread_id] = sum;
     }
   }
 }

From b28a605e976616133c8c9c823089a4a2bcd7a90d Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Thu, 6 Jun 2024 10:45:27 +0200
Subject: [PATCH 10/33] feat: add sum_bool and prod_bool kernels

---
 kernel-test-data.json                         |  4 +-
 .../cuda_kernels/awkward_reduce_prod_bool.cu  | 72 ++++++++++++++-----
 .../cuda_kernels/awkward_reduce_sum_bool.cu   | 72 ++++++++++++++-----
 3 files changed, 108 insertions(+), 40 deletions(-)

diff --git a/kernel-test-data.json b/kernel-test-data.json
index 0df339f461..21f7b1932f 100644
--- a/kernel-test-data.json
+++ b/kernel-test-data.json
@@ -22933,7 +22933,7 @@
         },
         {
             "name": "awkward_reduce_sum_bool",
-            "status": false,
+            "status": true,
             "tests": [
                 {
                     "error": false,
@@ -23056,7 +23056,7 @@
         },
         {
             "name": "awkward_reduce_prod_bool",
-            "status": false,
+            "status": true,
             "tests": [
                 {
                     "error": false,
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu
index 74843af6c0..af10c4f40d 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu
@@ -3,10 +3,17 @@
 // BEGIN PYTHON
 // def f(grid, block, args):
 //     (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args
-//     atomicAdd_toptr = cupy.array(toptr, dtype=cupy.uint64)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_c", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code))
+//     if block[0] > 0:
+//         segment = math.floor((outlength + block[0] - 1) / block[0])
+//         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
+//     else:
+//         segment = 0
+//         grid_size = 1
+//     partial = cupy.ones(outlength * grid_size, dtype=toptr.dtype)
+//     temp = cupy.ones(lenparents, dtype=toptr.dtype)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_a", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_b", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_c", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
 // out["awkward_reduce_prod_bool_a", {dtype_specializations}] = None
 // out["awkward_reduce_prod_bool_b", {dtype_specializations}] = None
 // out["awkward_reduce_prod_bool_c", {dtype_specializations}] = None
@@ -20,14 +27,15 @@ awkward_reduce_prod_bool_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomicAdd_toptr,
+    T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      atomicAdd_toptr[thread_id] = true;
+      toptr[thread_id] = 1;
     }
   }
 }
@@ -40,34 +48,60 @@ awkward_reduce_prod_bool_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomicAdd_toptr,
+    T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t idx = threadIdx.x;
+    int64_t thread_id = blockIdx.x * blockDim.x + idx;
 
     if (thread_id < lenparents) {
-      atomicAnd(atomicAdd_toptr + parents[thread_id],
-                (uint64_t)(fromptr[thread_id] != 0));
+      temp[idx] = fromptr[thread_id];
+    }
+    __syncthreads();
+
+    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+      T val = 1;
+      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+        val = temp[idx - stride];
+      }
+      __syncthreads();
+      temp[idx] &= (val != 0);
+      __syncthreads();
+    }
+
+    if (thread_id < lenparents) {
+      int64_t parent = parents[thread_id];
+      if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+        partial[blockIdx.x * outlength + parent] = temp[idx];
+      }
     }
   }
 }
 
 template <typename T, typename C, typename U>
 __global__ void
-awkward_reduce_prod_bool_c(T* toptr,
-                           const C* fromptr,
-                           const U* parents,
-                           int64_t lenparents,
-                           int64_t outlength,
-                           uint64_t* atomicAdd_toptr,
-                           uint64_t invocation_index,
-                           uint64_t* err_code) {
+awkward_reduce_prod_bool_c(  // final pass: one thread per output ANDs together the per-block partial results
+    T* toptr,  // output, one element per thread_id < outlength
+    const C* fromptr,
+    const U* parents,
+    int64_t lenparents,
+    int64_t outlength,
+    T* partial,  // per-block results, read as partial[block * outlength + output]
+    T* temp,  // not used by this kernel
+    uint64_t invocation_index,
+    uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      toptr[thread_id] = (T)atomicAdd_toptr[thread_id];
+      T prod = 1;  // stays 1 only while every partial entry seen so far is non-zero
+      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;  // ceil(lenparents / blockDim.x); assumes the same block size as the _b launch
+      for (int64_t i = 0; i < blocks; ++i) {
+        prod &= (partial[i * outlength + thread_id] != 0);  // a zero entry from any block clears the result
+      }
+      toptr[thread_id] = prod;
     }
   }
 }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu
index 0e062a6c78..cee00fd95f 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu
@@ -3,10 +3,17 @@
 // BEGIN PYTHON
 // def f(grid, block, args):
 //     (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args
-//     atomicAdd_toptr = cupy.array(toptr, dtype=cupy.uint64)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_c", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code))
+//     if block[0] > 0:
+//         segment = math.floor((outlength + block[0] - 1) / block[0])
+//         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
+//     else:
+//         segment = 0
+//         grid_size = 1
+//     partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype)
+//     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_a", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_b", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_c", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
 // out["awkward_reduce_sum_bool_a", {dtype_specializations}] = None
 // out["awkward_reduce_sum_bool_b", {dtype_specializations}] = None
 // out["awkward_reduce_sum_bool_c", {dtype_specializations}] = None
@@ -20,14 +27,15 @@ awkward_reduce_sum_bool_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomicAdd_toptr,
+    T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      atomicAdd_toptr[thread_id] = false;
+      toptr[thread_id] = 0;
     }
   }
 }
@@ -40,34 +48,60 @@ awkward_reduce_sum_bool_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomicAdd_toptr,
+    T* partial,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t idx = threadIdx.x;
+    int64_t thread_id = blockIdx.x * blockDim.x + idx;
 
     if (thread_id < lenparents) {
-      atomicOr(atomicAdd_toptr + parents[thread_id],
-               (uint64_t)(fromptr[thread_id] != 0));
+      temp[idx] = fromptr[thread_id];
+    }
+    __syncthreads();
+
+    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+      T val = 0;
+      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+        val = temp[idx - stride];
+      }
+      __syncthreads();
+      temp[idx] |= (val != 0);
+      __syncthreads();
+    }
+
+    if (thread_id < lenparents) {
+      int64_t parent = parents[thread_id];
+      if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+        partial[blockIdx.x * outlength + parent] = temp[idx];
+      }
     }
   }
 }
 
 template <typename T, typename C, typename U>
 __global__ void
-awkward_reduce_sum_bool_c(T* toptr,
-                          const C* fromptr,
-                          const U* parents,
-                          int64_t lenparents,
-                          int64_t outlength,
-                          uint64_t* atomicAdd_toptr,
-                          uint64_t invocation_index,
-                          uint64_t* err_code) {
+awkward_reduce_sum_bool_c(  // final pass: one thread per output ORs together the per-block partial results
+    T* toptr,  // output, one element per thread_id < outlength
+    const C* fromptr,
+    const U* parents,
+    int64_t lenparents,
+    int64_t outlength,
+    T* partial,  // per-block results, read as partial[block * outlength + output]
+    T* temp,  // not used by this kernel
+    uint64_t invocation_index,
+    uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      toptr[thread_id] = (T)atomicAdd_toptr[thread_id];
+      T sum = 0;  // becomes non-zero once any partial entry is non-zero
+      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;  // ceil(lenparents / blockDim.x); assumes the same block size as the _b launch
+      for (int64_t i = 0; i < blocks; ++i) {
+        sum |= (partial[i * outlength + thread_id] != 0);  // a non-zero entry from any block sets the result
+      }
+      toptr[thread_id] = sum;
     }
   }
 }

From 9e7abc710864bac255b2e73f79c95c07ccfe2779 Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Thu, 6 Jun 2024 11:54:57 +0200
Subject: [PATCH 11/33] fix: use cpt.assert_allclose

---
 dev/generate-tests.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/dev/generate-tests.py b/dev/generate-tests.py
index 7267d5659c..cb75fb1819 100644
--- a/dev/generate-tests.py
+++ b/dev/generate-tests.py
@@ -953,6 +953,7 @@ def gencudakerneltests(specdict):
 
                 f.write(
                     "import cupy\n"
+                    "import cupy.testing as cpt\n"
                     "import pytest\n\n"
                     "import awkward as ak\n"
                     "import awkward._connect.cuda as ak_cu\n"
@@ -1022,7 +1023,7 @@ def gencudakerneltests(specdict):
                             if isinstance(val, list):
                                 f.write(
                                     " " * 4
-                                    + f"assert cupy.array_equal({arg}[:len(pytest_{arg})], cupy.array(pytest_{arg}))\n"
+                                    + f"cpt.assert_allclose({arg}[:len(pytest_{arg})], cupy.array(pytest_{arg}))\n"
                                 )
                             else:
                                 f.write(" " * 4 + f"assert {arg} == pytest_{arg}\n")
@@ -1082,6 +1083,7 @@ def gencudaunittests(specdict):
                 f.write(
                     "import re\n"
                     "import cupy\n"
+                    "import cupy.testing as cpt\n"
                     "import pytest\n\n"
                     "import awkward as ak\n"
                     "import awkward._connect.cuda as ak_cu\n"
@@ -1218,7 +1220,7 @@ def gencudaunittests(specdict):
                                 if isinstance(val, list):
                                     f.write(
                                         " " * 4
-                                        + f"assert cupy.array_equal({arg}[:len(pytest_{arg})], cupy.array(pytest_{arg}))\n"
+                                        + f"cpt.assert_allclose({arg}[:len(pytest_{arg})], cupy.array(pytest_{arg}))\n"
                                     )
                                 else:
                                     f.write(" " * 4 + f"assert {arg} == pytest_{arg}\n")

From 458165c885428ccf08b34288287bbab316e09238 Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Thu, 6 Jun 2024 14:47:45 +0200
Subject: [PATCH 12/33] test: reducer integration tests

---
 tests-cuda/test_3136_cuda_reducers.py | 379 ++++++++++++++++++++++++++
 1 file changed, 379 insertions(+)
 create mode 100644 tests-cuda/test_3136_cuda_reducers.py

diff --git a/tests-cuda/test_3136_cuda_reducers.py b/tests-cuda/test_3136_cuda_reducers.py
new file mode 100644
index 0000000000..3bc554e1a1
--- /dev/null
+++ b/tests-cuda/test_3136_cuda_reducers.py
@@ -0,0 +1,379 @@
+from __future__ import annotations
+
+import cupy.testing as cpt
+import numpy as np
+
+import awkward as ak
+
+to_list = ak.operations.to_list
+
+
+def test_sumprod_types():
+    def prod(xs):
+        out = 1
+        for x in xs:
+            out *= x
+        return out
+
+    array = np.array([[True, False, False], [True, False, False]])
+    content2 = ak.contents.NumpyArray(array.reshape(-1))
+    offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(offsets3, content2)
+
+    depth1 = ak.to_backend(depth1, "cuda")
+
+    assert sum(to_list(np.sum(array, axis=-1))) == sum(
+        to_list(ak.sum(depth1, axis=-1, highlevel=False))
+    )
+    assert prod(to_list(np.prod(array, axis=-1))) == prod(
+        to_list(ak.prod(depth1, axis=-1, highlevel=False))
+    )
+
+    array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int8)
+    content2 = ak.contents.NumpyArray(array.reshape(-1))
+    offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(offsets3, content2)
+
+    depth1 = ak.to_backend(depth1, "cuda")
+
+    assert (
+        np.sum(array, axis=-1).dtype
+        == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype
+    )
+    assert (
+        np.prod(array, axis=-1).dtype
+        == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype
+    )
+    assert sum(to_list(np.sum(array, axis=-1))) == sum(
+        to_list(ak.sum(depth1, axis=-1, highlevel=False))
+    )
+    assert prod(to_list(np.prod(array, axis=-1))) == prod(
+        to_list(ak.prod(depth1, axis=-1, highlevel=False))
+    )
+
+    array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint8)
+    content2 = ak.contents.NumpyArray(array.reshape(-1))
+    offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(offsets3, content2)
+
+    depth1 = ak.to_backend(depth1, "cuda")
+
+    assert (
+        np.sum(array, axis=-1).dtype
+        == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype
+    )
+    assert (
+        np.prod(array, axis=-1).dtype
+        == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype
+    )
+    assert sum(to_list(np.sum(array, axis=-1))) == sum(
+        to_list(ak.sum(depth1, axis=-1, highlevel=False))
+    )
+    assert prod(to_list(np.prod(array, axis=-1))) == prod(
+        to_list(ak.prod(depth1, axis=-1, highlevel=False))
+    )
+
+    array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int16)
+    content2 = ak.contents.NumpyArray(array.reshape(-1))
+    offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(offsets3, content2)
+
+    depth1 = ak.to_backend(depth1, "cuda")
+
+    assert (
+        np.sum(array, axis=-1).dtype
+        == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype
+    )
+    assert (
+        np.prod(array, axis=-1).dtype
+        == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype
+    )
+    assert sum(to_list(np.sum(array, axis=-1))) == sum(
+        to_list(ak.sum(depth1, axis=-1, highlevel=False))
+    )
+    assert prod(to_list(np.prod(array, axis=-1))) == prod(
+        to_list(ak.prod(depth1, axis=-1, highlevel=False))
+    )
+
+    array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint16)
+    content2 = ak.contents.NumpyArray(array.reshape(-1))
+    offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(offsets3, content2)
+
+    depth1 = ak.to_backend(depth1, "cuda")
+
+    assert (
+        np.sum(array, axis=-1).dtype
+        == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype
+    )
+    assert (
+        np.prod(array, axis=-1).dtype
+        == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype
+    )
+    assert sum(to_list(np.sum(array, axis=-1))) == sum(
+        to_list(ak.sum(depth1, axis=-1, highlevel=False))
+    )
+    assert prod(to_list(np.prod(array, axis=-1))) == prod(
+        to_list(ak.prod(depth1, axis=-1, highlevel=False))
+    )
+
+    array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int32)
+    content2 = ak.contents.NumpyArray(array.reshape(-1))
+    offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(offsets3, content2)
+
+    depth1 = ak.to_backend(depth1, "cuda")
+
+    assert (
+        np.sum(array, axis=-1).dtype
+        == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype
+    )
+    assert (
+        np.prod(array, axis=-1).dtype
+        == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype
+    )
+    assert sum(to_list(np.sum(array, axis=-1))) == sum(
+        to_list(ak.sum(depth1, axis=-1, highlevel=False))
+    )
+    assert prod(to_list(np.prod(array, axis=-1))) == prod(
+        to_list(ak.prod(depth1, axis=-1, highlevel=False))
+    )
+
+    array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint32)
+    content2 = ak.contents.NumpyArray(array.reshape(-1))
+    offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(offsets3, content2)
+
+    depth1 = ak.to_backend(depth1, "cuda")
+
+    assert (
+        np.sum(array, axis=-1).dtype
+        == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype
+    )
+    assert (
+        np.prod(array, axis=-1).dtype
+        == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype
+    )
+    assert sum(to_list(np.sum(array, axis=-1))) == sum(
+        to_list(ak.sum(depth1, axis=-1, highlevel=False))
+    )
+    assert prod(to_list(np.prod(array, axis=-1))) == prod(
+        to_list(ak.prod(depth1, axis=-1, highlevel=False))
+    )
+
+    array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int64)
+    content2 = ak.contents.NumpyArray(array.reshape(-1))
+    offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(offsets3, content2)
+
+    depth1 = ak.to_backend(depth1, "cuda")
+
+    assert (
+        np.sum(array, axis=-1).dtype
+        == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype
+    )
+    assert (
+        np.prod(array, axis=-1).dtype
+        == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype
+    )
+    assert sum(to_list(np.sum(array, axis=-1))) == sum(
+        to_list(ak.sum(depth1, axis=-1, highlevel=False))
+    )
+    assert prod(to_list(np.prod(array, axis=-1))) == prod(
+        to_list(ak.prod(depth1, axis=-1, highlevel=False))
+    )
+
+    array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint64)
+    content2 = ak.contents.NumpyArray(array.reshape(-1))
+    offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(offsets3, content2)
+
+    depth1 = ak.to_backend(depth1, "cuda")
+
+    assert (
+        np.sum(array, axis=-1).dtype
+        == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype
+    )
+    assert (
+        np.prod(array, axis=-1).dtype
+        == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype
+    )
+    assert sum(to_list(np.sum(array, axis=-1))) == sum(
+        to_list(ak.sum(depth1, axis=-1, highlevel=False))
+    )
+    assert prod(to_list(np.prod(array, axis=-1))) == prod(
+        to_list(ak.prod(depth1, axis=-1, highlevel=False))
+    )
+
+
+def test_sumprod_types_FIXME():
+    array = np.array([[True, False, False], [True, False, False]])
+    content2 = ak.contents.NumpyArray(array.reshape(-1))
+    offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(offsets3, content2)
+    depth1 = ak.to_backend(depth1, "cuda")
+
+    assert (
+        np.sum(array, axis=-1).dtype
+        == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype
+    )
+    assert (
+        np.prod(array, axis=-1).dtype
+        == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype
+    )
+
+
+array = ak.Array(
+    [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda"
+)
+
+
+def test_sum():
+    cpt.assert_allclose(ak.sum(array, axis=None), 63.0)
+    assert ak.almost_equal(
+        ak.sum(array, axis=None, keepdims=True),
+        ak.to_regular(ak.Array([[63.0]], backend="cuda")),
+    )
+    assert ak.almost_equal(
+        ak.sum(array, axis=None, keepdims=True, mask_identity=True),
+        ak.to_regular(
+            ak.Array([[63.0]], backend="cuda").mask[ak.Array([[True]], backend="cuda")]
+        ),
+    )
+    assert ak.sum(array[2], axis=None, mask_identity=True) is None
+
+
+def test_prod():
+    cpt.assert_allclose(ak.prod(array[1:], axis=None), 4838400.0)
+    assert ak.prod(array, axis=None) == 0
+    assert ak.almost_equal(
+        ak.prod(array, axis=None, keepdims=True),
+        ak.to_regular(ak.Array([[0.0]], backend="cuda")),
+    )
+    assert ak.almost_equal(
+        ak.prod(array[1:], axis=None, keepdims=True),
+        ak.to_regular(ak.Array([[4838400.0]], backend="cuda")),
+    )
+    assert ak.almost_equal(
+        ak.prod(array[1:], axis=None, keepdims=True, mask_identity=True),
+        ak.to_regular(
+            ak.Array([[4838400.0]], backend="cuda").mask[
+                ak.Array([[True]], backend="cuda")
+            ]
+        ),
+    )
+    assert ak.prod(array[2], axis=None, mask_identity=True) is None
+
+
+def test_min():
+    cpt.assert_allclose(ak.min(array, axis=None), 0.0)
+    assert ak.almost_equal(
+        ak.min(array, axis=None, keepdims=True, mask_identity=False),
+        ak.to_regular(ak.Array([[0.0]], backend="cuda")),
+    )
+    assert ak.almost_equal(
+        ak.min(array, axis=None, keepdims=True, initial=-100.0, mask_identity=False),
+        ak.to_regular(ak.Array([[-100.0]], backend="cuda")),
+    )
+
+    assert ak.almost_equal(
+        ak.min(array, axis=None, keepdims=True, mask_identity=True),
+        ak.to_regular(
+            ak.Array([[0.0]], backend="cuda").mask[ak.Array([[True]], backend="cuda")]
+        ),
+    )
+    assert ak.almost_equal(
+        ak.min(array[-1:], axis=None, keepdims=True, mask_identity=True),
+        ak.to_regular(
+            ak.Array(ak.Array([[np.inf]], backend="cuda")).mask[
+                ak.Array([[False]], backend="cuda")
+            ]
+        ),
+    )
+    assert ak.min(array[2], axis=None, mask_identity=True) is None
+
+
+def test_max():
+    cpt.assert_allclose(ak.max(array, axis=None), 10.0)
+    assert ak.almost_equal(
+        ak.max(array, axis=None, keepdims=True, mask_identity=False),
+        ak.to_regular(ak.Array([[10.0]], backend="cuda")),
+    )
+    assert ak.almost_equal(
+        ak.max(array, axis=None, keepdims=True, initial=100.0, mask_identity=False),
+        ak.to_regular(ak.Array([[100.0]], backend="cuda")),
+    )
+    assert ak.almost_equal(
+        ak.max(array, axis=None, keepdims=True, mask_identity=True),
+        ak.to_regular(
+            ak.Array([[10.0]], backend="cuda").mask[ak.Array([[True]], backend="cuda")]
+        ),
+    )
+    assert ak.almost_equal(
+        ak.max(array[-1:], axis=None, keepdims=True, mask_identity=True),
+        ak.to_regular(
+            ak.Array(ak.Array([[np.inf]], backend="cuda")).mask[
+                ak.Array([[False]], backend="cuda")
+            ]
+        ),
+    )
+    assert ak.max(array[2], axis=None, mask_identity=True) is None
+
+
+array = ak.Array(
+    [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda"
+)
+
+
+def test_count():
+    assert ak.count(array, axis=None) == 12
+    assert ak.almost_equal(
+        ak.count(array, axis=None, keepdims=True, mask_identity=False),
+        ak.to_regular(ak.Array([[12]], backend="cuda")),
+    )
+    assert ak.almost_equal(
+        ak.count(array, axis=None, keepdims=True, mask_identity=True),
+        ak.to_regular(
+            ak.Array([[12]], backend="cuda").mask[ak.Array([[True]], backend="cuda")]
+        ),
+    )
+    assert ak.almost_equal(
+        ak.count(array[-1:], axis=None, keepdims=True, mask_identity=True),
+        ak.to_regular(
+            ak.Array([[0]], backend="cuda").mask[ak.Array([[False]], backend="cuda")]
+        ),
+    )
+    assert ak.count(array[2], axis=None, mask_identity=True) is None
+    assert ak.count(array[2], axis=None, mask_identity=False) == 0
+
+
+def test_count_nonzero():
+    assert ak.count_nonzero(array, axis=None) == 11
+    assert ak.almost_equal(
+        ak.count_nonzero(array, axis=None, keepdims=True, mask_identity=False),
+        ak.to_regular(ak.Array([[11]], backend="cuda")),
+    )
+    assert ak.almost_equal(
+        ak.count_nonzero(array, axis=None, keepdims=True, mask_identity=True),
+        ak.to_regular(
+            ak.Array([[11]], backend="cuda").mask[ak.Array([[True]], backend="cuda")]
+        ),
+    )
+    assert ak.almost_equal(
+        ak.count_nonzero(array[-1:], axis=None, keepdims=True, mask_identity=True),
+        ak.to_regular(
+            ak.Array([[0]], backend="cuda").mask[ak.Array([[False]], backend="cuda")]
+        ),
+    )
+    assert ak.count_nonzero(array[2], axis=None, mask_identity=True) is None
+    assert ak.count_nonzero(array[2], axis=None, mask_identity=False) == 0
+
+
+def test_std_no_mask_axis_none():
+    assert ak.almost_equal(
+        ak.std(array[-1:], axis=None, keepdims=True, mask_identity=True),
+        ak.to_regular(
+            ak.Array([[0.0]], backend="cuda").mask[ak.Array([[False]], backend="cuda")]
+        ),
+    )
+    assert ak.std(array[2], axis=None, mask_identity=True) is None

From c75cb7922ffa06a8ad1d3265b3d2fe3c14b38ccb Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Thu, 6 Jun 2024 16:30:46 +0200
Subject: [PATCH 13/33] fix: type conversion

---
 .../cuda/cuda_kernels/awkward_reduce_max.cu        | 14 +++++++-------
 .../cuda/cuda_kernels/awkward_reduce_min.cu        | 14 +++++++-------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu
index 4ac7df43ba..a411d1970a 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu
@@ -10,7 +10,7 @@
 //         segment = 0
 //         grid_size = 1
 //     partial = cupy.full(outlength * grid_size, identity, dtype=toptr.dtype)
-//     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
+//     temp = cupy.full(lenparents, identity, dtype=toptr.dtype)
 //     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code))
 //     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code))
 //     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code))
@@ -27,7 +27,7 @@ awkward_reduce_max_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T identity,
+    int64_t identity,
     T* partial,
     T* temp,
     uint64_t invocation_index,
@@ -36,7 +36,7 @@ awkward_reduce_max_a(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      toptr[thread_id] = identity;
+      toptr[thread_id] = static_cast<T>(identity);
     }
   }
 }
@@ -49,7 +49,7 @@ awkward_reduce_max_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T identity,
+    int64_t identity,
     T* partial,
     T* temp,
     uint64_t invocation_index,
@@ -64,7 +64,7 @@ awkward_reduce_max_b(
     __syncthreads();
 
     for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
-      T val = identity;
+      T val = static_cast<T>(identity);
       if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
         val = temp[idx - stride];
       }
@@ -90,7 +90,7 @@ awkward_reduce_max_c(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T identity,
+    int64_t identity,
     T* partial,
     T* temp,
     uint64_t invocation_index,
@@ -99,7 +99,7 @@ awkward_reduce_max_c(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      T maximum = identity;
+      T maximum = static_cast<T>(identity);
       int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;
       for (int64_t i = 0; i < blocks; ++i) {
         maximum = maximum > partial[i * outlength + thread_id] ? maximum : partial[i * outlength + thread_id];
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu
index f524485e58..828097a14f 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu
@@ -10,7 +10,7 @@
 //         segment = 0
 //         grid_size = 1
 //     partial = cupy.full(outlength * grid_size, identity, dtype=toptr.dtype)
-//     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
+//     temp = cupy.full(lenparents, identity, dtype=toptr.dtype)
 //     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code))
 //     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code))
 //     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code))
@@ -27,7 +27,7 @@ awkward_reduce_min_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T identity,
+    int64_t identity,
     T* partial,
     T* temp,
     uint64_t invocation_index,
@@ -36,7 +36,7 @@ awkward_reduce_min_a(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      toptr[thread_id] = identity;
+      toptr[thread_id] = static_cast<T>(identity);
     }
   }
 }
@@ -49,7 +49,7 @@ awkward_reduce_min_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T identity,
+    int64_t identity,
     T* partial,
     T* temp,
     uint64_t invocation_index,
@@ -64,7 +64,7 @@ awkward_reduce_min_b(
     __syncthreads();
 
     for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
-      T val = identity;
+      T val = static_cast<T>(identity);
       if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
         val = temp[idx - stride];
       }
@@ -90,7 +90,7 @@ awkward_reduce_min_c(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T identity,
+    int64_t identity,
     T* partial,
     T* temp,
     uint64_t invocation_index,
@@ -99,7 +99,7 @@ awkward_reduce_min_c(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      T minimum = identity;
+      T minimum = static_cast<T>(identity);
       int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;
       for (int64_t i = 0; i < blocks; ++i) {
         minimum = minimum < partial[i * outlength + thread_id] ? minimum : partial[i * outlength + thread_id];

From 427670c15b97c645b19ac939ff726522171b344a Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Fri, 7 Jun 2024 11:56:24 +0200
Subject: [PATCH 14/33] fix: use atomic to avoid race conditions

---
 ...tOffsetArray_reduce_local_outoffsets_64.cu | 32 ++++++++-----------
 .../cuda_kernels/awkward_reduce_argmax.cu     | 31 ++++++------------
 .../cuda_kernels/awkward_reduce_argmin.cu     | 31 ++++++------------
 .../cuda_kernels/awkward_reduce_count_64.cu   | 27 ++++++----------
 .../awkward_reduce_countnonzero.cu            | 27 ++++++----------
 .../awkward_reduce_sum_int32_bool_64.cu       | 27 ++++++----------
 .../awkward_reduce_sum_int64_bool_64.cu       | 27 ++++++----------
 7 files changed, 73 insertions(+), 129 deletions(-)

diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu
index 5ae3d2eb56..3dcdf14727 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu
@@ -7,16 +7,15 @@
 //         segment = math.floor((outlength + block[0] - 1) / block[0])
 //         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
 //     else:
-//         segment = 0
 //         grid_size = 1
-//     partial = cupy.zeros(outlength * grid_size, dtype=outoffsets.dtype)
+//     atomic_outoffsets = cupy.array(outoffsets, dtype=cupy.uint64)
 //     temp = cupy.zeros(lenparents, dtype=cupy.int64)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, atomic_outoffsets, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, atomic_outoffsets, temp, invocation_index, err_code))
 //     scan_in_array = cupy.zeros(outlength, dtype=cupy.int64)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_c", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((segment,), block, (outoffsets, parents, lenparents, outlength, partial, scan_in_array, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_c", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, atomic_outoffsets, scan_in_array, invocation_index, err_code))
 //     scan_in_array = cupy.cumsum(scan_in_array)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_d", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, partial, scan_in_array, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_d", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, atomic_outoffsets, scan_in_array, invocation_index, err_code))
 // out["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", {dtype_specializations}] = None
 // out["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", {dtype_specializations}] = None
 // out["awkward_ListOffsetArray_reduce_local_outoffsets_64_c", {dtype_specializations}] = None
@@ -30,7 +29,7 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_a(
     const C* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint64_t* atomic_outoffsets,
     int64_t* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -38,7 +37,7 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_a(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      outoffsets[thread_id] = 0;
+      atomic_outoffsets[thread_id] = 0;
     }
   }
 }
@@ -50,7 +49,7 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_b(
     const C* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint64_t* atomic_outoffsets,
     int64_t* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -76,7 +75,7 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_b(
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        partial[blockIdx.x * outlength + parent] = temp[idx];
+        atomicAdd(&atomic_outoffsets[parent], temp[idx]);
       }
     }
   }
@@ -89,7 +88,7 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_c(
     const C* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint64_t* atomic_outoffsets,
     int64_t* scan_in_array,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -97,12 +96,7 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_c(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      int64_t count = 0;
-      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;
-      for (int64_t i = 0; i < blocks; ++i) {
-        count += partial[i * outlength + thread_id];
-      }
-      scan_in_array[thread_id] = count;
+      scan_in_array[thread_id] = atomic_outoffsets[thread_id];
     }
   }
 }
@@ -114,7 +108,7 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_d(
     const C* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint64_t* atomic_outoffsets,
     int64_t* scan_in_array,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -123,7 +117,7 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_d(
     outoffsets[0] = 0;
 
     if (thread_id < outlength) {
-      outoffsets[thread_id + 1] = scan_in_array[thread_id];
+      outoffsets[thread_id + 1] = static_cast<T>(scan_in_array[thread_id]);
     }
   }
 }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu
index 122894795e..d2fe929a6b 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu
@@ -4,16 +4,14 @@
 // def f(grid, block, args):
 //     (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args
 //     if block[0] > 0:
-//         segment = math.floor((outlength + block[0] - 1) / block[0])
 //         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
 //     else:
-//         segment = 0
 //         grid_size = 1
-//     partial = cupy.full(outlength * grid_size, -1, dtype=toptr.dtype)
+//     atomic_toptr = cupy.array(toptr, dtype=cupy.uint64)
 //     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
 // out["awkward_reduce_argmax_a", {dtype_specializations}] = None
 // out["awkward_reduce_argmax_b", {dtype_specializations}] = None
 // out["awkward_reduce_argmax_c", {dtype_specializations}] = None
@@ -27,7 +25,7 @@ awkward_reduce_argmax_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -35,7 +33,7 @@ awkward_reduce_argmax_a(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      toptr[thread_id] = -1;
+      atomic_toptr[thread_id] = -1;
     }
   }
 }
@@ -48,7 +46,7 @@ awkward_reduce_argmax_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -76,7 +74,7 @@ awkward_reduce_argmax_b(
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        partial[blockIdx.x * outlength + parent] = temp[idx];
+        atomicExch(&atomic_toptr[parent], temp[idx]);
       }
     }
   }
@@ -90,7 +88,7 @@ awkward_reduce_argmax_c(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -98,16 +96,7 @@ awkward_reduce_argmax_c(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      int64_t argmax = -1;
-      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;
-      for (int64_t i = 0; i < blocks; ++i) {
-        int64_t index = partial[i * outlength + thread_id];
-        if (index != -1 && (argmax == -1 || fromptr[index] > fromptr[argmax]) ||
-           (fromptr[index] == fromptr[argmax] && index < argmax)) {
-          argmax = index;
-        }
-      }
-      toptr[thread_id] = argmax;
+      toptr[thread_id] = static_cast<T>(atomic_toptr[thread_id]);
     }
   }
 }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu
index 40a8437218..754ec84cfb 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu
@@ -4,16 +4,14 @@
 // def f(grid, block, args):
 //     (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args
 //     if block[0] > 0:
-//         segment = math.floor((outlength + block[0] - 1) / block[0])
 //         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
 //     else:
-//         segment = 0
 //         grid_size = 1
-//     partial = cupy.full(outlength * grid_size, -1, dtype=toptr.dtype)
+//     atomic_toptr = cupy.array(toptr, dtype=cupy.uint64)
 //     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
 // out["awkward_reduce_argmin_a", {dtype_specializations}] = None
 // out["awkward_reduce_argmin_b", {dtype_specializations}] = None
 // out["awkward_reduce_argmin_c", {dtype_specializations}] = None
@@ -27,7 +25,7 @@ awkward_reduce_argmin_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -35,7 +33,7 @@ awkward_reduce_argmin_a(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      toptr[thread_id] = -1;
+      atomic_toptr[thread_id] = -1;
     }
   }
 }
@@ -48,7 +46,7 @@ awkward_reduce_argmin_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -76,7 +74,7 @@ awkward_reduce_argmin_b(
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        partial[blockIdx.x * outlength + parent] = temp[idx];
+        atomicExch(&atomic_toptr[parent], temp[idx]);
       }
     }
   }
@@ -90,7 +88,7 @@ awkward_reduce_argmin_c(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -98,16 +96,7 @@ awkward_reduce_argmin_c(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      int64_t argmin = -1;
-      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;
-      for (int64_t i = 0; i < blocks; ++i) {
-        int64_t index = partial[i * outlength + thread_id];
-        if (index != -1 && (argmin == -1 || fromptr[index] < fromptr[argmin]) ||
-           (fromptr[index] == fromptr[argmin] && index < argmin)) {
-          argmin = index;
-        }
-      }
-      toptr[thread_id] = argmin;
+      toptr[thread_id] = static_cast<T>(atomic_toptr[thread_id]);
     }
   }
 }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu
index cdf870c63c..f2a306f5c6 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu
@@ -4,16 +4,14 @@
 // def f(grid, block, args):
 //     (toptr, parents, lenparents, outlength, invocation_index, err_code) = args
 //     if block[0] > 0:
-//         segment = math.floor((outlength + block[0] - 1) / block[0])
 //         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
 //     else:
-//         segment = 0
 //         grid_size = 1
-//     partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype)
+//     atomic_toptr = cupy.array(toptr, dtype=cupy.uint64)
 //     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_a", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_b", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_c", cupy.dtype(toptr.dtype).type, parents.dtype]))((segment,), block, (toptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_a", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_b", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_c", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
 // out["awkward_reduce_count_64_a", {dtype_specializations}] = None
 // out["awkward_reduce_count_64_b", {dtype_specializations}] = None
 // out["awkward_reduce_count_64_c", {dtype_specializations}] = None
@@ -26,7 +24,7 @@ awkward_reduce_count_64_a(
     const C* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -34,7 +32,7 @@ awkward_reduce_count_64_a(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      toptr[thread_id] = 0;
+      atomic_toptr[thread_id] = 0;
     }
   }
 }
@@ -46,7 +44,7 @@ awkward_reduce_count_64_b(
     const C* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -72,7 +70,7 @@ awkward_reduce_count_64_b(
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        partial[blockIdx.x * outlength + parent] = temp[idx];
+        atomicAdd(&atomic_toptr[parent], temp[idx]);
       }
     }
   }
@@ -85,7 +83,7 @@ awkward_reduce_count_64_c(
     const C* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -93,12 +91,7 @@ awkward_reduce_count_64_c(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      int64_t count = 0;
-      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;
-      for (int64_t i = 0; i < blocks; ++i) {
-        count += partial[i * outlength + thread_id];
-      }
-      toptr[thread_id] = count;
+      toptr[thread_id] = static_cast<T>(atomic_toptr[thread_id]);
     }
   }
 }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu
index 11bb84b18f..1652e0b918 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu
@@ -4,16 +4,14 @@
 // def f(grid, block, args):
 //     (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args
 //     if block[0] > 0:
-//         segment = math.floor((outlength + block[0] - 1) / block[0])
 //         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
 //     else:
-//         segment = 0
 //         grid_size = 1
-//     partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype)
+//     atomic_toptr = cupy.array(toptr, dtype=cupy.uint64)
 //     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
 // out["awkward_reduce_countnonzero_a", {dtype_specializations}] = None
 // out["awkward_reduce_countnonzero_b", {dtype_specializations}] = None
 // out["awkward_reduce_countnonzero_c", {dtype_specializations}] = None
@@ -27,7 +25,7 @@ awkward_reduce_countnonzero_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -35,7 +33,7 @@ awkward_reduce_countnonzero_a(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      toptr[thread_id] = 0;
+      atomic_toptr[thread_id] = 0;
     }
   }
 }
@@ -48,7 +46,7 @@ awkward_reduce_countnonzero_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -74,7 +72,7 @@ awkward_reduce_countnonzero_b(
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        partial[blockIdx.x * outlength + parent] = temp[idx];
+        atomicAdd(&atomic_toptr[parent], temp[idx]);
       }
     }
   }
@@ -88,7 +86,7 @@ awkward_reduce_countnonzero_c(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -96,12 +94,7 @@ awkward_reduce_countnonzero_c(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      int64_t count = 0;
-      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;
-      for (int64_t i = 0; i < blocks; ++i) {
-        count += partial[i * outlength + thread_id];
-      }
-      toptr[thread_id] = count;
+      toptr[thread_id] = static_cast<T>(atomic_toptr[thread_id]);
     }
   }
 }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu
index 52cc05492d..15e983b35b 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu
@@ -4,16 +4,14 @@
 // def f(grid, block, args):
 //     (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args
 //     if block[0] > 0:
-//         segment = math.floor((outlength + block[0] - 1) / block[0])
 //         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
 //     else:
-//         segment = 0
 //         grid_size = 1
-//     partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype)
+//     atomic_toptr = cupy.array(toptr, dtype=cupy.uint64)
 //     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_a", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_b", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_c", int32, bool_, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_a", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_b", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_c", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
 // out["awkward_reduce_sum_int32_bool_64_a", {dtype_specializations}] = None
 // out["awkward_reduce_sum_int32_bool_64_b", {dtype_specializations}] = None
 // out["awkward_reduce_sum_int32_bool_64_c", {dtype_specializations}] = None
@@ -27,7 +25,7 @@ awkward_reduce_sum_int32_bool_64_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -35,7 +33,7 @@ awkward_reduce_sum_int32_bool_64_a(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      toptr[thread_id] = 0;
+      atomic_toptr[thread_id] = 0;
     }
   }
 }
@@ -48,7 +46,7 @@ awkward_reduce_sum_int32_bool_64_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -74,7 +72,7 @@ awkward_reduce_sum_int32_bool_64_b(
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        partial[blockIdx.x * outlength + parent] = temp[idx];
+        atomicAdd(&atomic_toptr[parent], temp[idx]);
       }
     }
   }
@@ -88,7 +86,7 @@ awkward_reduce_sum_int32_bool_64_c(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -96,12 +94,7 @@ awkward_reduce_sum_int32_bool_64_c(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      T sum = 0;
-      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;
-      for (int64_t i = 0; i < blocks; ++i) {
-        sum += partial[i * outlength + thread_id];
-      }
-      toptr[thread_id] = sum;
+      toptr[thread_id] = static_cast<T>(atomic_toptr[thread_id]);
     }
   }
 }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu
index a215bb92f3..d381c526a9 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu
@@ -4,16 +4,14 @@
 // def f(grid, block, args):
 //     (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args
 //     if block[0] > 0:
-//         segment = math.floor((outlength + block[0] - 1) / block[0])
 //         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
 //     else:
-//         segment = 0
 //         grid_size = 1
-//     partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype)
+//     atomic_toptr = cupy.array(toptr, dtype=cupy.uint64)
 //     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_a", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_b", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_c", int64, bool_, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_a", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_b", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_c", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
 // out["awkward_reduce_sum_int64_bool_64_a", {dtype_specializations}] = None
 // out["awkward_reduce_sum_int64_bool_64_b", {dtype_specializations}] = None
 // out["awkward_reduce_sum_int64_bool_64_c", {dtype_specializations}] = None
@@ -27,7 +25,7 @@ awkward_reduce_sum_int64_bool_64_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -35,7 +33,7 @@ awkward_reduce_sum_int64_bool_64_a(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      toptr[thread_id] = 0;
+      atomic_toptr[thread_id] = 0;
     }
   }
 }
@@ -48,7 +46,7 @@ awkward_reduce_sum_int64_bool_64_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -74,7 +72,7 @@ awkward_reduce_sum_int64_bool_64_b(
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        partial[blockIdx.x * outlength + parent] = temp[idx];
+        atomicAdd(&atomic_toptr[parent], temp[idx]);
       }
     }
   }
@@ -88,7 +86,7 @@ awkward_reduce_sum_int64_bool_64_c(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -96,12 +94,7 @@ awkward_reduce_sum_int64_bool_64_c(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      T sum = 0;
-      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;
-      for (int64_t i = 0; i < blocks; ++i) {
-        sum += partial[i * outlength + thread_id];
-      }
-      toptr[thread_id] = sum;
+      toptr[thread_id] = static_cast<T>(atomic_toptr[thread_id]);
     }
   }
 }

From 127e035b4e03bde5b27677162ee1f5c734244cf0 Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Mon, 10 Jun 2024 17:02:49 +0200
Subject: [PATCH 15/33] fix: remove unnecessary variable

---
 .../cuda/cuda_kernels/awkward_reduce_sum.cu   | 39 +------------
 .../cuda_kernels/awkward_reduce_sum_bool.cu   | 27 ++++-----
 .../awkward_reduce_sum_int32_bool_64.cu       | 34 ++---------
 .../awkward_reduce_sum_int64_bool_64.cu       | 34 ++---------
 .../_connect/cuda/cuda_kernels/cuda_common.cu | 57 +++++++++++++++++++
 5 files changed, 78 insertions(+), 113 deletions(-)

diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu
index e641d728b4..bcbad2e07a 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu
@@ -4,19 +4,14 @@
 // def f(grid, block, args):
 //     (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args
 //     if block[0] > 0:
-//         segment = math.floor((outlength + block[0] - 1) / block[0])
 //         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
 //     else:
-//         segment = 0
 //         grid_size = 1
-//     partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype)
 //     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, temp, invocation_index, err_code))
 // out["awkward_reduce_sum_a", {dtype_specializations}] = None
 // out["awkward_reduce_sum_b", {dtype_specializations}] = None
-// out["awkward_reduce_sum_c", {dtype_specializations}] = None
 // END PYTHON
 
 template <typename T, typename C, typename U>
@@ -27,7 +22,6 @@ awkward_reduce_sum_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -48,7 +42,6 @@ awkward_reduce_sum_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -74,34 +67,8 @@ awkward_reduce_sum_b(
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        partial[blockIdx.x * outlength + parent] = temp[idx];
+        atomicAdd(&toptr[parent], temp[idx]);
       }
     }
   }
 }
-
-template <typename T, typename C, typename U>
-__global__ void
-awkward_reduce_sum_c(
-    T* toptr,
-    const C* fromptr,
-    const U* parents,
-    int64_t lenparents,
-    int64_t outlength,
-    T* partial,
-    T* temp,
-    uint64_t invocation_index,
-    uint64_t* err_code) {
-  if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (thread_id < outlength) {
-      T sum = 0;
-      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;
-      for (int64_t i = 0; i < blocks; ++i) {
-        sum += partial[i * outlength + thread_id];
-      }
-      toptr[thread_id] = sum;
-    }
-  }
-}
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu
index cee00fd95f..d233a9b9ed 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu
@@ -4,16 +4,14 @@
 // def f(grid, block, args):
 //     (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args
 //     if block[0] > 0:
-//         segment = math.floor((outlength + block[0] - 1) / block[0])
 //         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
 //     else:
-//         segment = 0
 //         grid_size = 1
-//     partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype)
+//     atomic_toptr = cupy.array(toptr, dtype=cupy.uint32)
 //     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_a", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_b", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_c", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_a", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_b", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_c", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
 // out["awkward_reduce_sum_bool_a", {dtype_specializations}] = None
 // out["awkward_reduce_sum_bool_b", {dtype_specializations}] = None
 // out["awkward_reduce_sum_bool_c", {dtype_specializations}] = None
@@ -27,7 +25,7 @@ awkward_reduce_sum_bool_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint32_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -35,7 +33,7 @@ awkward_reduce_sum_bool_a(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      toptr[thread_id] = 0;
+      atomic_toptr[thread_id] = 0;
     }
   }
 }
@@ -48,7 +46,7 @@ awkward_reduce_sum_bool_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint32_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -74,7 +72,7 @@ awkward_reduce_sum_bool_b(
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        partial[blockIdx.x * outlength + parent] = temp[idx];
+        atomicOr(&atomic_toptr[parent], temp[idx]);
       }
     }
   }
@@ -88,7 +86,7 @@ awkward_reduce_sum_bool_c(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint32_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -96,12 +94,7 @@ awkward_reduce_sum_bool_c(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      T sum = 0;
-      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;
-      for (int64_t i = 0; i < blocks; ++i) {
-        sum |= (partial[i * outlength + thread_id] != 0);
-      }
-      toptr[thread_id] = sum;
+      toptr[thread_id] = (T)(atomic_toptr[thread_id]);
     }
   }
 }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu
index 15e983b35b..9aa8636e9b 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu
@@ -7,14 +7,11 @@
 //         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
 //     else:
 //         grid_size = 1
-//     atomic_toptr = cupy.array(toptr, dtype=cupy.uint64)
 //     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_a", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_b", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_c", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_a", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_b", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, temp, invocation_index, err_code))
 // out["awkward_reduce_sum_int32_bool_64_a", {dtype_specializations}] = None
 // out["awkward_reduce_sum_int32_bool_64_b", {dtype_specializations}] = None
-// out["awkward_reduce_sum_int32_bool_64_c", {dtype_specializations}] = None
 // END PYTHON
 
 template <typename T, typename C, typename U>
@@ -25,7 +22,6 @@ awkward_reduce_sum_int32_bool_64_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -33,7 +29,7 @@ awkward_reduce_sum_int32_bool_64_a(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      atomic_toptr[thread_id] = 0;
+      toptr[thread_id] = 0;
     }
   }
 }
@@ -46,7 +42,6 @@ awkward_reduce_sum_int32_bool_64_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -72,29 +67,8 @@ awkward_reduce_sum_int32_bool_64_b(
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        atomicAdd(&atomic_toptr[parent], temp[idx]);
+        atomicAdd(&toptr[parent], temp[idx]);
       }
     }
   }
 }
-
-template <typename T, typename C, typename U>
-__global__ void
-awkward_reduce_sum_int32_bool_64_c(
-    T* toptr,
-    const C* fromptr,
-    const U* parents,
-    int64_t lenparents,
-    int64_t outlength,
-    uint64_t* atomic_toptr,
-    T* temp,
-    uint64_t invocation_index,
-    uint64_t* err_code) {
-  if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (thread_id < outlength) {
-      toptr[thread_id] = static_cast<T>(atomic_toptr[thread_id]);
-    }
-  }
-}
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu
index d381c526a9..9f6399eac2 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu
@@ -7,14 +7,11 @@
 //         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
 //     else:
 //         grid_size = 1
-//     atomic_toptr = cupy.array(toptr, dtype=cupy.uint64)
 //     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_a", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_b", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_c", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_a", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_b", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, temp, invocation_index, err_code))
 // out["awkward_reduce_sum_int64_bool_64_a", {dtype_specializations}] = None
 // out["awkward_reduce_sum_int64_bool_64_b", {dtype_specializations}] = None
-// out["awkward_reduce_sum_int64_bool_64_c", {dtype_specializations}] = None
 // END PYTHON
 
 template <typename T, typename C, typename U>
@@ -25,7 +22,6 @@ awkward_reduce_sum_int64_bool_64_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -33,7 +29,7 @@ awkward_reduce_sum_int64_bool_64_a(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      atomic_toptr[thread_id] = 0;
+      toptr[thread_id] = 0;
     }
   }
 }
@@ -46,7 +42,6 @@ awkward_reduce_sum_int64_bool_64_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -72,29 +67,8 @@ awkward_reduce_sum_int64_bool_64_b(
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        atomicAdd(&atomic_toptr[parent], temp[idx]);
+        atomicAdd(&toptr[parent], temp[idx]);
       }
     }
   }
 }
-
-template <typename T, typename C, typename U>
-__global__ void
-awkward_reduce_sum_int64_bool_64_c(
-    T* toptr,
-    const C* fromptr,
-    const U* parents,
-    int64_t lenparents,
-    int64_t outlength,
-    uint64_t* atomic_toptr,
-    T* temp,
-    uint64_t invocation_index,
-    uint64_t* err_code) {
-  if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (thread_id < outlength) {
-      toptr[thread_id] = static_cast<T>(atomic_toptr[thread_id]);
-    }
-  }
-}
diff --git a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu
index 8a02094f34..89ad707471 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu
@@ -32,6 +32,27 @@ typedef unsigned long long uintmax_t;
   atomicMin(err_code,                  \
             invocation_index*(1 << ERROR_BITS) + (int)(ERROR_KERNEL_CODE));
 
+// BEGIN PYTHON
+// def min_max_type(dtype):
+//   supported_types = {
+//       'bool': cupy.int32,
+//       'int8': cupy.int32,
+//       'int16': cupy.int32,
+//       'int32': cupy.int32,
+//       'int64': cupy.int64,
+//       'uint8': cupy.uint32,
+//       'uint16': cupy.uint32,
+//       'uint32': cupy.uint32,
+//       'uint64': cupy.uint64,
+//       'float32': cupy.float32,
+//       'float64': cupy.float64
+//   }
+//   if str(dtype) in supported_types:
+//       return supported_types[str(dtype)]
+//   else:
+//       raise ValueError("Unsupported dtype.", dtype)
+// END PYTHON
+
 // BEGIN PYTHON
 // def inclusive_scan(grid, block, args):
 //     (d_in, invocation_index, err_code) = args
@@ -144,3 +165,39 @@ exclusive_scan_kernel(T* input,
     }
   }
 }
+
+__device__ __forceinline__ float atomicMin(float* addr, float value) {
+  float old; old = !signbit(value) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value))) : __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value)));
+  return old;
+}
+__device__ __forceinline__ float atomicMax(float* addr, float value) {
+  float old; old = !signbit(value) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) : __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));
+  return old;
+}
+
+
+typedef long long int64_t;
+
+
+template <typename T>
+struct is_int64_t {
+    static const bool value = false;
+};
+
+
+template <>
+struct is_int64_t<int64_t> {
+    static const bool value = true;
+};
+
+
+
+__device__ int64_t atomicAdd(int64_t* address, int64_t val) {
+  uint64_t* address_as_ull = (uint64_t*)address;
+  uint64_t old = *address_as_ull, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull, assumed, assumed + (uint64_t)val);
+  } while (assumed != old);
+  return (int64_t)old;
+}

From 8dee2aec2ede72be9763733a8fd44b56bf15c920 Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Mon, 10 Jun 2024 17:18:57 +0200
Subject: [PATCH 16/33] fix: minor fixes

---
 .../cuda_kernels/awkward_reduce_argmax.cu     |  2 +-
 .../cuda_kernels/awkward_reduce_argmin.cu     |  2 +-
 .../cuda_kernels/awkward_reduce_count_64.cu   | 33 +++---------------
 .../awkward_reduce_countnonzero.cu            | 34 +++----------------
 .../_connect/cuda/cuda_kernels/cuda_common.cu | 16 ---------
 5 files changed, 10 insertions(+), 77 deletions(-)

diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu
index d2fe929a6b..0202d7276f 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu
@@ -96,7 +96,7 @@ awkward_reduce_argmax_c(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      toptr[thread_id] = static_cast<T>(atomic_toptr[thread_id]);
+      toptr[thread_id] = (T)(atomic_toptr[thread_id]);
     }
   }
 }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu
index 754ec84cfb..e2215b1b11 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu
@@ -96,7 +96,7 @@ awkward_reduce_argmin_c(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      toptr[thread_id] = static_cast<T>(atomic_toptr[thread_id]);
+      toptr[thread_id] = (T)(atomic_toptr[thread_id]);
     }
   }
 }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu
index f2a306f5c6..2d317aebcd 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu
@@ -7,14 +7,11 @@
 //         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
 //     else:
 //         grid_size = 1
-//     atomic_toptr = cupy.array(toptr, dtype=cupy.uint64)
 //     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_a", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_b", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_c", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_a", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_b", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, temp, invocation_index, err_code))
 // out["awkward_reduce_count_64_a", {dtype_specializations}] = None
 // out["awkward_reduce_count_64_b", {dtype_specializations}] = None
-// out["awkward_reduce_count_64_c", {dtype_specializations}] = None
 // END PYTHON
 
 template <typename T, typename C>
@@ -24,7 +21,6 @@ awkward_reduce_count_64_a(
     const C* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -32,7 +28,7 @@ awkward_reduce_count_64_a(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      atomic_toptr[thread_id] = 0;
+      toptr[thread_id] = 0;
     }
   }
 }
@@ -44,7 +40,6 @@ awkward_reduce_count_64_b(
     const C* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -70,28 +65,8 @@ awkward_reduce_count_64_b(
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        atomicAdd(&atomic_toptr[parent], temp[idx]);
+        atomicAdd(&toptr[parent], temp[idx]);
       }
     }
   }
 }
-
-template <typename T, typename C>
-__global__ void
-awkward_reduce_count_64_c(
-    T* toptr,
-    const C* parents,
-    int64_t lenparents,
-    int64_t outlength,
-    uint64_t* atomic_toptr,
-    T* temp,
-    uint64_t invocation_index,
-    uint64_t* err_code) {
-  if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (thread_id < outlength) {
-      toptr[thread_id] = static_cast<T>(atomic_toptr[thread_id]);
-    }
-  }
-}
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu
index 1652e0b918..7af29ad6c7 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu
@@ -7,14 +7,11 @@
 //         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
 //     else:
 //         grid_size = 1
-//     atomic_toptr = cupy.array(toptr, dtype=cupy.uint64)
 //     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, temp, invocation_index, err_code))
 // out["awkward_reduce_countnonzero_a", {dtype_specializations}] = None
 // out["awkward_reduce_countnonzero_b", {dtype_specializations}] = None
-// out["awkward_reduce_countnonzero_c", {dtype_specializations}] = None
 // END PYTHON
 
 template <typename T, typename C, typename U>
@@ -25,7 +22,6 @@ awkward_reduce_countnonzero_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -33,7 +29,7 @@ awkward_reduce_countnonzero_a(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      atomic_toptr[thread_id] = 0;
+      toptr[thread_id] = 0;
     }
   }
 }
@@ -46,7 +42,6 @@ awkward_reduce_countnonzero_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -72,29 +67,8 @@ awkward_reduce_countnonzero_b(
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        atomicAdd(&atomic_toptr[parent], temp[idx]);
+        atomicAdd(&toptr[parent], temp[idx]);
       }
     }
   }
 }
-
-template <typename T, typename C, typename U>
-__global__ void
-awkward_reduce_countnonzero_c(
-    T* toptr,
-    const C* fromptr,
-    const U* parents,
-    int64_t lenparents,
-    int64_t outlength,
-    uint64_t* atomic_toptr,
-    T* temp,
-    uint64_t invocation_index,
-    uint64_t* err_code) {
-  if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (thread_id < outlength) {
-      toptr[thread_id] = static_cast<T>(atomic_toptr[thread_id]);
-    }
-  }
-}
diff --git a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu
index 89ad707471..9fbeea3c1f 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu
@@ -176,22 +176,6 @@ __device__ __forceinline__ float atomicMax(float* addr, float value) {
 }
 
 
-typedef long long int64_t;
-
-
-template <typename T>
-struct is_int64_t {
-    static const bool value = false;
-};
-
-
-template <>
-struct is_int64_t<int64_t> {
-    static const bool value = true;
-};
-
-
-
 __device__ int64_t atomicAdd(int64_t* address, int64_t val) {
   uint64_t* address_as_ull = (uint64_t*)address;
   uint64_t old = *address_as_ull, assumed;

From 896770f1c4665891f04d1dd3e8ee67288405f6ef Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Wed, 12 Jun 2024 14:27:03 +0200
Subject: [PATCH 17/33] fix: all reducer for atomics

---
 kernel-test-data.json                         | 130 +++++++-
 ...tOffsetArray_reduce_local_outoffsets_64.cu |  51 +--
 .../cuda_kernels/awkward_reduce_argmax.cu     |  12 +-
 .../cuda_kernels/awkward_reduce_argmin.cu     |  12 +-
 .../cuda_kernels/awkward_reduce_count_64.cu   |   8 +-
 .../awkward_reduce_countnonzero.cu            |   8 +-
 .../cuda/cuda_kernels/awkward_reduce_max.cu   |  50 +--
 .../cuda/cuda_kernels/awkward_reduce_min.cu   |  54 +---
 .../cuda/cuda_kernels/awkward_reduce_prod.cu  |  33 +-
 .../cuda_kernels/awkward_reduce_prod_bool.cu  |  33 +-
 .../cuda/cuda_kernels/awkward_reduce_sum.cu   |   8 +-
 .../cuda_kernels/awkward_reduce_sum_bool.cu   |   8 +-
 .../awkward_reduce_sum_int32_bool_64.cu       |   8 +-
 .../awkward_reduce_sum_int64_bool_64.cu       |   8 +-
 .../_connect/cuda/cuda_kernels/cuda_common.cu | 293 ++++++++++++++++--
 tests-cuda/test_3136_cuda_reducers.py         |   1 -
 16 files changed, 488 insertions(+), 229 deletions(-)

diff --git a/kernel-test-data.json b/kernel-test-data.json
index efa4033e7b..e843bf9b05 100644
--- a/kernel-test-data.json
+++ b/kernel-test-data.json
@@ -13108,6 +13108,76 @@
                 }
             ]
         },
+        {
+            "name": "awkward_ListArray_combinations",
+            "status": true,
+            "tests": [
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromindex": [],
+                        "length": 0,
+                        "n": 0,
+                        "replacement": false,
+                        "starts": [],
+                        "stops": []
+                    },
+                    "outputs": {
+                        "tocarry": [[0], [0]],
+                        "toindex": [0]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromindex": [0],
+                        "length": 1,
+                        "n": 2,
+                        "replacement": false,
+                        "starts": [0],
+                        "stops": [2]
+                    },
+                    "outputs": {
+                        "tocarry": [[0, 1], [0, 1]],
+                        "toindex": [1, 1]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromindex": [0, 3, 3, 5, 7],
+                        "length": 5,
+                        "n": 2,
+                        "replacement": false,
+                        "starts": [0, 4, 4, 7, 8],
+                        "stops": [4, 4, 7, 8, 13]
+                    },
+                    "outputs": {
+                        "tocarry": [[0, 6, 6, 9, 9, 19], [0, 6, 6, 9, 9, 19]],
+                        "toindex": [0, 6, 6, 9, 9, 19]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromindex": [0, 4, 4, 7, 8],
+                        "length": 5,
+                        "n": 2,
+                        "replacement": false,
+                        "starts": [0, 3, 3, 10, 10],
+                        "stops": [3, 3, 5, 10, 13]
+                    },
+                    "outputs": {
+                        "tocarry": [[0, 3, 3, 4, 4, 7], [0, 6, 6, 9, 9, 19]],
+                        "toindex": [0, 6, 6, 9, 9, 19]
+                    }
+                }
+            ]
+        },
         {
             "name": "awkward_ListArray_getitem_jagged_carrylen",
             "status": true,
@@ -24366,6 +24436,64 @@
                 }
             ]
         },
+        {
+            "name": "awkward_reduce_sum_complex",
+            "status": true,
+            "tests": [
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [],
+                        "lenparents": 0,
+                        "outlength": 0,
+                        "parents": []
+                    },
+                    "outputs": {
+                        "toptr": []
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [0, 0],
+                        "lenparents": 1,
+                        "outlength": 2,
+                        "parents": [0]
+                    },
+                    "outputs": {
+                        "toptr": [0, 0]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [2, 2, 3, 3, 5, 5, 7, 7, 11, 11, 13, 13, 17, 17, 19, 19, 23, 23],
+                        "lenparents": 9,
+                        "outlength": 12,
+                        "parents": [0, 0, 0, 2, 2, 3, 4, 4, 5]
+                    },
+                    "outputs": {
+                        "toptr": [10, 10, 0, 0, 18, 18, 13, 13, 36, 36, 23, 23]
+                    }
+                },
+                {
+                    "error": false,
+                    "message": "",
+                    "inputs": {
+                        "fromptr": [1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1],
+                        "lenparents": 6,
+                        "outlength": 8,
+                        "parents": [0, 0, 0, 2, 2, 3]
+                    },
+                    "outputs": {
+                        "toptr": [1, 3, 0, 0, 1, 2, 0, 1]
+                    }
+                }
+            ]
+        },
         {
             "name": "awkward_reduce_sum",
             "status": true,
@@ -26349,7 +26477,7 @@
                     "error": false,
                     "message": "",
                     "inputs": {
-                        "fromptr": [0, 4, 1, 3, 5, 6],
+                        "fromptr": [0, 4, 1, 1, 5, 6],
                         "identity": 9223372036854775807,
                         "lenparents": 6,
                         "outlength": 4,
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu
index 3dcdf14727..42e8119d46 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu
@@ -8,18 +8,15 @@
 //         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
 //     else:
 //         grid_size = 1
-//     atomic_outoffsets = cupy.array(outoffsets, dtype=cupy.uint64)
 //     temp = cupy.zeros(lenparents, dtype=cupy.int64)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, atomic_outoffsets, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, atomic_outoffsets, temp, invocation_index, err_code))
-//     scan_in_array = cupy.zeros(outlength, dtype=cupy.int64)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_c", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, atomic_outoffsets, scan_in_array, invocation_index, err_code))
+//     scan_in_array = cupy.zeros(outlength, dtype=cupy.uint64)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, scan_in_array, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, scan_in_array, temp, invocation_index, err_code))
 //     scan_in_array = cupy.cumsum(scan_in_array)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_d", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, atomic_outoffsets, scan_in_array, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_c", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, scan_in_array, temp, invocation_index, err_code))
 // out["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", {dtype_specializations}] = None
 // out["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", {dtype_specializations}] = None
 // out["awkward_ListOffsetArray_reduce_local_outoffsets_64_c", {dtype_specializations}] = None
-// out["awkward_ListOffsetArray_reduce_local_outoffsets_64_d", {dtype_specializations}] = None
 // END PYTHON
 
 template <typename T, typename C>
@@ -29,7 +26,7 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_a(
     const C* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomic_outoffsets,
+    uint64_t* scan_in_array,
     int64_t* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -37,7 +34,7 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_a(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      atomic_outoffsets[thread_id] = 0;
+      outoffsets[thread_id] = 0;
     }
   }
 }
@@ -49,7 +46,7 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_b(
     const C* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomic_outoffsets,
+    uint64_t* scan_in_array,
     int64_t* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -58,24 +55,24 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_b(
     int64_t thread_id = blockIdx.x * blockDim.x + idx;
 
     if (thread_id < lenparents) {
-      temp[idx] = 1;
+      temp[thread_id] = 1;
     }
     __syncthreads();
 
     for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
       int64_t val = 0;
       if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        val = temp[idx - stride];
+        val = temp[thread_id - stride];
       }
       __syncthreads();
-      temp[idx] += val;
+      temp[thread_id] += val;
       __syncthreads();
     }
 
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        atomicAdd(&atomic_outoffsets[parent], temp[idx]);
+        atomicAdd(&scan_in_array[parent], temp[thread_id]);
       }
     }
   }
@@ -88,28 +85,8 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_c(
     const C* parents,
     int64_t lenparents,
     int64_t outlength,
-    uint64_t* atomic_outoffsets,
-    int64_t* scan_in_array,
-    uint64_t invocation_index,
-    uint64_t* err_code) {
-  if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (thread_id < outlength) {
-      scan_in_array[thread_id] = atomic_outoffsets[thread_id];
-    }
-  }
-}
-
-template <typename T, typename C>
-__global__ void
-awkward_ListOffsetArray_reduce_local_outoffsets_64_d(
-    T* outoffsets,
-    const C* parents,
-    int64_t lenparents,
-    int64_t outlength,
-    uint64_t* atomic_outoffsets,
-    int64_t* scan_in_array,
+    uint64_t* scan_in_array,
+    int64_t* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
@@ -117,7 +94,7 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_d(
     outoffsets[0] = 0;
 
     if (thread_id < outlength) {
-      outoffsets[thread_id + 1] = static_cast<T>(scan_in_array[thread_id]);
+      outoffsets[thread_id + 1] = (T)(scan_in_array[thread_id]);
     }
   }
 }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu
index 0202d7276f..df515f05a4 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu
@@ -55,18 +55,18 @@ awkward_reduce_argmax_b(
     int64_t thread_id = blockIdx.x * blockDim.x + idx;
 
     if (thread_id < lenparents) {
-      temp[idx] = thread_id;
+      temp[thread_id] = thread_id;
     }
     __syncthreads();
 
     for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
       int64_t index = -1;
       if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        index = temp[idx - stride];
+        index = temp[thread_id - stride];
       }
-      if (index != -1 && (temp[idx] == -1 || fromptr[index] > fromptr[temp[idx]] ||
-         (fromptr[index] == fromptr[temp[idx]] && index < temp[idx]))) {
-        temp[idx] = index;
+      if (index != -1 && (temp[thread_id] == -1 || fromptr[index] > fromptr[temp[thread_id]] ||
+         (fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) {
+        temp[thread_id] = index;
       }
       __syncthreads();
     }
@@ -74,7 +74,7 @@ awkward_reduce_argmax_b(
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        atomicExch(&atomic_toptr[parent], temp[idx]);
+        atomicExch(&atomic_toptr[parent], temp[thread_id]);
       }
     }
   }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu
index e2215b1b11..af1d3fd93d 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu
@@ -55,18 +55,18 @@ awkward_reduce_argmin_b(
     int64_t thread_id = blockIdx.x * blockDim.x + idx;
 
     if (thread_id < lenparents) {
-      temp[idx] = thread_id;
+      temp[thread_id] = thread_id;
     }
     __syncthreads();
 
     for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
       int64_t index = -1;
       if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        index = temp[idx - stride];
+        index = temp[thread_id - stride];
       }
-      if (index != -1 && (temp[idx] == -1 || fromptr[index] < fromptr[temp[idx]] ||
-         (fromptr[index] == fromptr[temp[idx]] && index < temp[idx]))) {
-        temp[idx] = index;
+      if (index != -1 && (temp[thread_id] == -1 || fromptr[index] < fromptr[temp[thread_id]] ||
+         (fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) {
+        temp[thread_id] = index;
       }
       __syncthreads();
     }
@@ -74,7 +74,7 @@ awkward_reduce_argmin_b(
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        atomicExch(&atomic_toptr[parent], temp[idx]);
+        atomicExch(&atomic_toptr[parent], temp[thread_id]);
       }
     }
   }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu
index 2d317aebcd..9c55e69600 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu
@@ -48,24 +48,24 @@ awkward_reduce_count_64_b(
     int64_t thread_id = blockIdx.x * blockDim.x + idx;
 
     if (thread_id < lenparents) {
-      temp[idx] = 1;
+      temp[thread_id] = 1;
     }
     __syncthreads();
 
     for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
       int64_t val = 0;
       if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        val = temp[idx - stride];
+        val = temp[thread_id - stride];
       }
       __syncthreads();
-      temp[idx] += val;
+      if (thread_id < lenparents) { temp[thread_id] += val; }  // threads past lenparents own no element
       __syncthreads();
     }
 
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        atomicAdd(&toptr[parent], temp[idx]);
+        atomicAdd(&toptr[parent], temp[thread_id]);
       }
     }
   }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu
index 7af29ad6c7..ffcb0b8bd3 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu
@@ -50,24 +50,24 @@ awkward_reduce_countnonzero_b(
     int64_t thread_id = blockIdx.x * blockDim.x + idx;
 
     if (thread_id < lenparents) {
-      temp[idx] = (fromptr[thread_id] != 0) ? 1 : 0;
+      temp[thread_id] = (fromptr[thread_id] != 0) ? 1 : 0;
     }
     __syncthreads();
 
     for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
       int64_t val = 0;
       if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        val = temp[idx - stride];
+        val = temp[thread_id - stride];
       }
       __syncthreads();
-      temp[idx] += val;
+      if (thread_id < lenparents) { temp[thread_id] += val; }  // threads past lenparents own no element
       __syncthreads();
     }
 
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        atomicAdd(&toptr[parent], temp[idx]);
+        atomicAdd(&toptr[parent], temp[thread_id]);
       }
     }
   }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu
index a411d1970a..4afbe3f04c 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu
@@ -4,19 +4,14 @@
 // def f(grid, block, args):
 //     (toptr, fromptr, parents, lenparents, outlength, identity, invocation_index, err_code) = args
 //     if block[0] > 0:
-//         segment = math.floor((outlength + block[0] - 1) / block[0])
 //         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
 //     else:
-//         segment = 0
 //         grid_size = 1
-//     partial = cupy.full(outlength * grid_size, identity, dtype=toptr.dtype)
 //     temp = cupy.full(lenparents, identity, dtype=toptr.dtype)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, temp, invocation_index, err_code))
 // out["awkward_reduce_max_a", {dtype_specializations}] = None
 // out["awkward_reduce_max_b", {dtype_specializations}] = None
-// out["awkward_reduce_max_c", {dtype_specializations}] = None
 // END PYTHON
 
 template <typename T, typename C, typename U>
@@ -27,14 +22,12 @@ awkward_reduce_max_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    int64_t identity,
-    T* partial,
+    T identity,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-
     if (thread_id < outlength) {
       toptr[thread_id] = static_cast<T>(identity);
     }
@@ -49,8 +42,7 @@ awkward_reduce_max_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    int64_t identity,
-    T* partial,
+    T identity,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -59,52 +51,26 @@ awkward_reduce_max_b(
     int64_t thread_id = blockIdx.x * blockDim.x + idx;
 
     if (thread_id < lenparents) {
-      temp[idx] = fromptr[thread_id];
+      temp[thread_id] = fromptr[thread_id];
     }
     __syncthreads();
 
     for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
       T val = static_cast<T>(identity);
+
       if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        val = temp[idx - stride];
+        val = temp[thread_id - stride];
       }
       __syncthreads();
-      temp[idx] = val > temp[idx] ? val : temp[idx];
+      if (thread_id < lenparents && val > temp[thread_id]) { temp[thread_id] = val; }  // temp has only lenparents slots
       __syncthreads();
     }
 
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        partial[blockIdx.x * outlength + parent] = temp[idx];
-      }
-    }
-  }
-}
-
-template <typename T, typename C, typename U>
-__global__ void
-awkward_reduce_max_c(
-    T* toptr,
-    const C* fromptr,
-    const U* parents,
-    int64_t lenparents,
-    int64_t outlength,
-    int64_t identity,
-    T* partial,
-    T* temp,
-    uint64_t invocation_index,
-    uint64_t* err_code) {
-  if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (thread_id < outlength) {
-      T maximum = static_cast<T>(identity);
-      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;
-      for (int64_t i = 0; i < blocks; ++i) {
-        maximum = maximum > partial[i * outlength + thread_id] ? maximum : partial[i * outlength + thread_id];
+        atomicMax(&toptr[parent], temp[thread_id]);
       }
-      toptr[thread_id] = maximum;
     }
   }
 }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu
index 828097a14f..34325d91f1 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu
@@ -4,19 +4,14 @@
 // def f(grid, block, args):
 //     (toptr, fromptr, parents, lenparents, outlength, identity, invocation_index, err_code) = args
 //     if block[0] > 0:
-//         segment = math.floor((outlength + block[0] - 1) / block[0])
 //         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
 //     else:
-//         segment = 0
 //         grid_size = 1
-//     partial = cupy.full(outlength * grid_size, identity, dtype=toptr.dtype)
 //     temp = cupy.full(lenparents, identity, dtype=toptr.dtype)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, temp, invocation_index, err_code))
 // out["awkward_reduce_min_a", {dtype_specializations}] = None
 // out["awkward_reduce_min_b", {dtype_specializations}] = None
-// out["awkward_reduce_min_c", {dtype_specializations}] = None
 // END PYTHON
 
 template <typename T, typename C, typename U>
@@ -27,8 +22,7 @@ awkward_reduce_min_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    int64_t identity,
-    T* partial,
+    T identity,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -36,7 +30,7 @@ awkward_reduce_min_a(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      toptr[thread_id] = static_cast<T>(identity);
+      toptr[thread_id] = identity;
     }
   }
 }
@@ -49,8 +43,7 @@ awkward_reduce_min_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    int64_t identity,
-    T* partial,
+    T identity,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -59,52 +52,25 @@ awkward_reduce_min_b(
     int64_t thread_id = blockIdx.x * blockDim.x + idx;
 
     if (thread_id < lenparents) {
-      temp[idx] = fromptr[thread_id];
+      temp[thread_id] = fromptr[thread_id];
     }
     __syncthreads();
 
     for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
-      T val = static_cast<T>(identity);
+      T val = identity;
       if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        val = temp[idx - stride];
+        val = temp[thread_id - stride];
       }
       __syncthreads();
-      temp[idx] = val < temp[idx] ? val : temp[idx];
+      if (thread_id < lenparents && val < temp[thread_id]) { temp[thread_id] = val; }  // temp has only lenparents slots
       __syncthreads();
     }
 
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        partial[blockIdx.x * outlength + parent] = temp[idx];
+        atomicMin(&toptr[parent], temp[thread_id]);
       }
     }
   }
 }
-
-template <typename T, typename C, typename U>
-__global__ void
-awkward_reduce_min_c(
-    T* toptr,
-    const C* fromptr,
-    const U* parents,
-    int64_t lenparents,
-    int64_t outlength,
-    int64_t identity,
-    T* partial,
-    T* temp,
-    uint64_t invocation_index,
-    uint64_t* err_code) {
-  if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (thread_id < outlength) {
-      T minimum = static_cast<T>(identity);
-      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;
-      for (int64_t i = 0; i < blocks; ++i) {
-        minimum = minimum < partial[i * outlength + thread_id] ? minimum : partial[i * outlength + thread_id];
-      }
-      toptr[thread_id] = minimum;
-    }
-  }
-}
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu
index 1f7e6d4ff0..9248e20efc 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu
@@ -4,16 +4,14 @@
 // def f(grid, block, args):
 //     (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args
 //     if block[0] > 0:
-//         segment = math.floor((outlength + block[0] - 1) / block[0])
 //         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
 //     else:
-//         segment = 0
 //         grid_size = 1
-//     partial = cupy.ones(outlength * grid_size, dtype=toptr.dtype)
+//     atomic_toptr = cupy.array(toptr, dtype=toptr.dtype)
 //     temp = cupy.ones(lenparents, dtype=toptr.dtype)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
 // out["awkward_reduce_prod_a", {dtype_specializations}] = None
 // out["awkward_reduce_prod_b", {dtype_specializations}] = None
 // out["awkward_reduce_prod_c", {dtype_specializations}] = None
@@ -27,7 +25,7 @@ awkward_reduce_prod_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    T* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -35,7 +33,7 @@ awkward_reduce_prod_a(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      toptr[thread_id] = 1;
+      atomic_toptr[thread_id] = 1;
     }
   }
 }
@@ -48,7 +46,7 @@ awkward_reduce_prod_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    T* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -57,24 +55,24 @@ awkward_reduce_prod_b(
     int64_t thread_id = blockIdx.x * blockDim.x + idx;
 
     if (thread_id < lenparents) {
-      temp[idx] = fromptr[thread_id];
+      temp[thread_id] = fromptr[thread_id];
     }
     __syncthreads();
 
     for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
       T val = 1;
       if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        val = temp[idx - stride];
+        val = temp[thread_id - stride];
       }
       __syncthreads();
-      temp[idx] *= val;
+      if (thread_id < lenparents) { temp[thread_id] *= val; }  // temp has only lenparents slots
       __syncthreads();
     }
 
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        partial[blockIdx.x * outlength + parent] = temp[idx];
+        atomicMul(&atomic_toptr[parent], temp[thread_id]);
       }
     }
   }
@@ -88,7 +86,7 @@ awkward_reduce_prod_c(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    T* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -96,12 +94,7 @@ awkward_reduce_prod_c(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      T prod = 1;
-      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;
-      for (int64_t i = 0; i < blocks; ++i) {
-        prod *= partial[i * outlength + thread_id];
-      }
-      toptr[thread_id] = prod;
+      toptr[thread_id] = (T)(atomic_toptr[thread_id]);
     }
   }
 }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu
index af10c4f40d..9d85b366c7 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu
@@ -4,16 +4,14 @@
 // def f(grid, block, args):
 //     (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args
 //     if block[0] > 0:
-//         segment = math.floor((outlength + block[0] - 1) / block[0])
 //         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
 //     else:
-//         segment = 0
 //         grid_size = 1
-//     partial = cupy.ones(outlength * grid_size, dtype=toptr.dtype)
+//     atomic_toptr = cupy.array(toptr, dtype=cupy.uint32)
 //     temp = cupy.ones(lenparents, dtype=toptr.dtype)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_a", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_b", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_c", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_a", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_b", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_c", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
 // out["awkward_reduce_prod_bool_a", {dtype_specializations}] = None
 // out["awkward_reduce_prod_bool_b", {dtype_specializations}] = None
 // out["awkward_reduce_prod_bool_c", {dtype_specializations}] = None
@@ -27,7 +25,7 @@ awkward_reduce_prod_bool_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint32_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -35,7 +33,7 @@ awkward_reduce_prod_bool_a(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      toptr[thread_id] = 1;
+      atomic_toptr[thread_id] = 1;
     }
   }
 }
@@ -48,7 +46,7 @@ awkward_reduce_prod_bool_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint32_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -57,24 +55,24 @@ awkward_reduce_prod_bool_b(
     int64_t thread_id = blockIdx.x * blockDim.x + idx;
 
     if (thread_id < lenparents) {
-      temp[idx] = fromptr[thread_id];
+      temp[thread_id] = fromptr[thread_id];
     }
     __syncthreads();
 
     for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
       T val = 1;
       if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        val = temp[idx - stride];
+        val = temp[thread_id - stride];
       }
       __syncthreads();
-      temp[idx] &= (val != 0);
+      if (thread_id < lenparents) { temp[thread_id] &= (val != 0); }  // temp has only lenparents slots
       __syncthreads();
     }
 
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        partial[blockIdx.x * outlength + parent] = temp[idx];
+        atomicAnd(&atomic_toptr[parent], temp[thread_id]);
       }
     }
   }
@@ -88,7 +86,7 @@ awkward_reduce_prod_bool_c(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
-    T* partial,
+    uint32_t* atomic_toptr,
     T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
@@ -96,12 +94,7 @@ awkward_reduce_prod_bool_c(
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (thread_id < outlength) {
-      T prod = 1;
-      int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x;
-      for (int64_t i = 0; i < blocks; ++i) {
-        prod &= (partial[i * outlength + thread_id] != 0);
-      }
-      toptr[thread_id] = prod;
+      toptr[thread_id] = (T)(atomic_toptr[thread_id]);
     }
   }
 }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu
index bcbad2e07a..8ce2b8159c 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu
@@ -50,24 +50,24 @@ awkward_reduce_sum_b(
     int64_t thread_id = blockIdx.x * blockDim.x + idx;
 
     if (thread_id < lenparents) {
-      temp[idx] = fromptr[thread_id];
+      temp[thread_id] = fromptr[thread_id];
     }
     __syncthreads();
 
     for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
       T val = 0;
       if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        val = temp[idx - stride];
+        val = temp[thread_id - stride];
       }
       __syncthreads();
-      temp[idx] += val;
+      if (thread_id < lenparents) { temp[thread_id] += val; }  // threads past lenparents own no element
       __syncthreads();
     }
 
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        atomicAdd(&toptr[parent], temp[idx]);
+        atomicAdd(&toptr[parent], temp[thread_id]);
       }
     }
   }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu
index d233a9b9ed..f85df8e20a 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu
@@ -55,24 +55,24 @@ awkward_reduce_sum_bool_b(
     int64_t thread_id = blockIdx.x * blockDim.x + idx;
 
     if (thread_id < lenparents) {
-      temp[idx] = fromptr[thread_id];
+      temp[thread_id] = fromptr[thread_id];
     }
     __syncthreads();
 
     for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
       T val = 0;
       if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        val = temp[idx - stride];
+        val = temp[thread_id - stride];
       }
       __syncthreads();
-      temp[idx] |= (val != 0);
+      if (thread_id < lenparents) { temp[thread_id] |= (val != 0); }  // threads past lenparents own no element
       __syncthreads();
     }
 
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        atomicOr(&atomic_toptr[parent], temp[idx]);
+        atomicOr(&atomic_toptr[parent], temp[thread_id]);
       }
     }
   }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu
index 9aa8636e9b..f52b6fb21c 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu
@@ -50,24 +50,24 @@ awkward_reduce_sum_int32_bool_64_b(
     int64_t thread_id = blockIdx.x * blockDim.x + idx;
 
     if (thread_id < lenparents) {
-      temp[idx] = (fromptr[thread_id] != 0) ? 1 : 0;;
+      temp[thread_id] = (fromptr[thread_id] != 0) ? 1 : 0;
     }
     __syncthreads();
 
     for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
       T val = 0;
       if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        val = temp[idx - stride];
+        val = temp[thread_id - stride];
       }
       __syncthreads();
-      temp[idx] += val;
+      if (thread_id < lenparents) { temp[thread_id] += val; }  // threads past lenparents own no element
       __syncthreads();
     }
 
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        atomicAdd(&toptr[parent], temp[idx]);
+        atomicAdd(&toptr[parent], temp[thread_id]);
       }
     }
   }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu
index 9f6399eac2..7e220cccc0 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu
@@ -50,24 +50,24 @@ awkward_reduce_sum_int64_bool_64_b(
     int64_t thread_id = blockIdx.x * blockDim.x + idx;
 
     if (thread_id < lenparents) {
-      temp[idx] = (fromptr[thread_id] != 0) ? 1 : 0;;
+      temp[thread_id] = (fromptr[thread_id] != 0) ? 1 : 0;
     }
     __syncthreads();
 
     for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
       T val = 0;
       if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        val = temp[idx - stride];
+        val = temp[thread_id - stride];
       }
       __syncthreads();
-      temp[idx] += val;
+      if (thread_id < lenparents) { temp[thread_id] += val; }  // threads past lenparents own no element
       __syncthreads();
     }
 
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-        atomicAdd(&toptr[parent], temp[idx]);
+        atomicAdd(&toptr[parent], temp[thread_id]);
       }
     }
   }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu
index cdb7babcfa..9d55a7b713 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu
@@ -53,6 +53,10 @@ typedef unsigned long long uintmax_t;
 //       raise ValueError("Unsupported dtype.", dtype)
 // END PYTHON
 
+
+// used by awkward_ListArray_getitem_next_range_carrylength
+// and awkward_ListArray_getitem_next_range kernels
+
 const int64_t  kMaxInt64  = 9223372036854775806;   // 2**63 - 2: see below
 const int64_t  kSliceNone = kMaxInt64 + 1;         // for Slice::none()
 
@@ -64,43 +68,200 @@ awkward_regularize_rangeslice(
     bool hasstart,
     bool hasstop,
     int64_t length) {
-    if (posstep) {
-      if (!hasstart)           *start = 0;
-      else if (*start < 0)     *start += length;
-      if (*start < 0)          *start = 0;
-      if (*start > length)     *start = length;
-
-      if (!hasstop)            *stop = length;
-      else if (*stop < 0)      *stop += length;
-      if (*stop < 0)           *stop = 0;
-      if (*stop > length)      *stop = length;
-      if (*stop < *start)      *stop = *start;
-    }
-
-    else {
-      if (!hasstart)           *start = length - 1;
-      else if (*start < 0)     *start += length;
-      if (*start < -1)         *start = -1;
-      if (*start > length - 1) *start = length - 1;
-
-      if (!hasstop)            *stop = -1;
-      else if (*stop < 0)      *stop += length;
-      if (*stop < -1)          *stop = -1;
-      if (*stop > length - 1)  *stop = length - 1;
-      if (*stop > *start)      *stop = *start;
-    }
+  if (posstep) {
+    if (!hasstart)           *start = 0;
+    else if (*start < 0)     *start += length;
+    if (*start < 0)          *start = 0;
+    if (*start > length)     *start = length;
+
+    if (!hasstop)            *stop = length;
+    else if (*stop < 0)      *stop += length;
+    if (*stop < 0)           *stop = 0;
+    if (*stop > length)      *stop = length;
+    if (*stop < *start)      *stop = *start;
+  }
+
+  else {
+    if (!hasstart)           *start = length - 1;
+    else if (*start < 0)     *start += length;
+    if (*start < -1)         *start = -1;
+    if (*start > length - 1) *start = length - 1;
+
+    if (!hasstop)            *stop = -1;
+    else if (*stop < 0)      *stop += length;
+    if (*stop < -1)          *stop = -1;
+    if (*stop > length - 1)  *stop = length - 1;
+    if (*stop > *start)      *stop = *start;
   }
 }
 
+
+// atomicMin() specializations
+template <typename T>
+__device__ T atomicMin(T* address, T val);
+
+// atomicMin() specialization for int8_t
+template <>
+__device__ int8_t atomicMin<int8_t>(int8_t* address, int8_t val) {
+    unsigned int *base_address = (unsigned int *)((size_t)address & ~3);
+    unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};
+    unsigned int sel = selectors[(size_t)address & 3];
+    unsigned int old, assumed, min_, new_;
+    old = *base_address;
+    do {
+        assumed = old;
+        min_ = min(val, (int8_t)__byte_perm(old, 0, ((size_t)address & 3)));
+        new_ = __byte_perm(old, min_, sel);
+        old = atomicCAS(base_address, assumed, new_);
+    } while (assumed != old);
+    return old;
+}
+
+// atomicMin() specialization for uint8_t
+template <>
+__device__ uint8_t atomicMin<uint8_t>(uint8_t* address, uint8_t val) {
+    unsigned int *base_address = (unsigned int *)((size_t)address & ~3);
+    unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};
+    unsigned int sel = selectors[(size_t)address & 3];
+    unsigned int old, assumed, min_, new_;
+    old = *base_address;
+    do {
+        assumed = old;
+        min_ = min(val, (uint8_t)__byte_perm(old, 0, ((size_t)address & 3)));
+        new_ = __byte_perm(old, min_, sel);
+        old = atomicCAS(base_address, assumed, new_);
+    } while (assumed != old);
+    return old;
+}
+
+// atomicMin() specialization for int16_t
+template <>
+__device__ int16_t atomicMin<int16_t>(int16_t* address, int16_t val) {
+   uint16_t* address_as_ush = reinterpret_cast<uint16_t*>(address);
+   uint16_t old = *address_as_ush, assumed;
+   do {
+       assumed = old;
+       int16_t temp = min(val, reinterpret_cast<int16_t&>(assumed));
+       old = atomicCAS(
+           address_as_ush, assumed, reinterpret_cast<uint16_t&>(temp)
+       );
+   } while (assumed != old);
+   return reinterpret_cast<int16_t&>(old);
+}
+
+// atomicMin() specialization for uint16_t
+template <>
+__device__ uint16_t atomicMin<uint16_t>(uint16_t* address, uint16_t val) {
+    uint16_t old = *address, assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address, assumed, min(val, assumed));
+    } while (assumed != old);
+    return old;
+}
+
+// atomicMin() specialization for float
 __device__ __forceinline__ float atomicMin(float* addr, float value) {
-  float old; old = !signbit(value) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value))) : __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value)));
+  float old;
+  old = !signbit(value) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value)))
+            : __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value)));
+  return old;
+}
+
+// atomicMin() specialization for double
+__device__ __forceinline__ double atomicMin(double* addr, double value) {
+  double old;
+  old = !signbit(value) ? __longlong_as_double(atomicMin((long long int*)addr, __double_as_longlong(value)))
+            : __ull2double_rz(atomicMax((unsigned long long int*)addr, __double2ull_ru(value)));
+  return old;
+}
+
+
+// atomicMax() specializations
+template <typename T>
+__device__ T atomicMax(T* address, T val);
+
+// atomicMax() specialization for int8_t
+template <>
+__device__ int8_t atomicMax<int8_t>(int8_t* address, int8_t val) {
+    unsigned int *base_address = (unsigned int *)((size_t)address & ~3);
+    unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};
+    unsigned int sel = selectors[(size_t)address & 3];
+    unsigned int old, assumed, max_, new_;
+    old = *base_address;
+    do {
+        assumed = old;
+        max_ = max(val, (int8_t)__byte_perm(old, 0, ((size_t)address & 3)));
+        new_ = __byte_perm(old, max_, sel);
+        old = atomicCAS(base_address, assumed, new_);
+    } while (assumed != old);
+    return old;
+}
+
+// atomicMax() specialization for uint8_t
+template <>
+__device__ uint8_t atomicMax<uint8_t>(uint8_t* address, uint8_t val) {
+    unsigned int *base_address = (unsigned int *)((size_t)address & ~3);
+    unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};
+    unsigned int sel = selectors[(size_t)address & 3];
+    unsigned int old, assumed, max_, new_;
+    old = *base_address;
+    do {
+        assumed = old;
+        max_ = max(val, (uint8_t)__byte_perm(old, 0, ((size_t)address & 3)));
+        new_ = __byte_perm(old, max_, sel);
+        old = atomicCAS(base_address, assumed, new_);
+    } while (assumed != old);
+    return old;
+}
+
+// atomicMax() specialization for int16_t
+template <>
+__device__ int16_t atomicMax<int16_t>(int16_t* address, int16_t val) {
+   uint16_t* address_as_ush = reinterpret_cast<uint16_t*>(address);
+   uint16_t old = *address_as_ush, assumed;
+   do {
+       assumed = old;
+       int16_t temp = max(val, reinterpret_cast<int16_t&>(assumed));
+       old = atomicCAS(
+           address_as_ush, assumed, reinterpret_cast<uint16_t&>(temp)
+       );
+   } while (assumed != old);
+   return reinterpret_cast<int16_t&>(old);
+}
+
+// atomicMax() specialization for uint16_t
+template <>
+__device__ uint16_t atomicMax<uint16_t>(uint16_t* address, uint16_t val) {
+    uint16_t old = *address, assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address, assumed, max(val, assumed));
+    } while (assumed != old);
+    return old;
+}
+
+// atomicMax() specialization for float
+template <>
+__device__ float atomicMax<float>(float* addr, float value) {
+  float old;
+  old = !signbit(value) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value)))
+            : __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));
   return old;
 }
-__device__ __forceinline__ float atomicMax(float* addr, float value) {
-  float old; old = !signbit(value) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) : __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));
+
+// atomicMax() specialization for double
+template <>
+__device__ double atomicMax<double>(double* addr, double value) {
+  double old;
+  old = !signbit(value) ? __longlong_as_double(atomicMax((long long int*)addr, __double_as_longlong(value)))
+            : __ull2double_rz(atomicMin((unsigned long long int*)addr, __double2ull_ru(value)));
   return old;
 }
 
+
+// atomicAdd() specialization for int64_t
+// uses 2's complement
 __device__ int64_t atomicAdd(int64_t* address, int64_t val) {
   uint64_t* address_as_ull = (uint64_t*)address;
   uint64_t old = *address_as_ull, assumed;
@@ -110,3 +271,79 @@ __device__ int64_t atomicAdd(int64_t* address, int64_t val) {
   } while (assumed != old);
   return (int64_t)old;
 }
+
+
+// atomicMul() specializations
+template <typename T>
+__device__ T atomicMul(T* address, T val);
+
+// atomicMul() specialization for int32_t
+template <>
+__device__ int32_t atomicMul<int32_t>(int32_t* address, int32_t val) {
+    int32_t old = *address, assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address, assumed, assumed * val);
+    } while (assumed != old);
+    return old;
+}
+
+// atomicMul() specialization for uint32_t
+template <>
+__device__ uint32_t atomicMul<uint32_t>(uint32_t* address, uint32_t val) {
+    uint32_t old = *address, assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address, assumed, assumed * val);
+    } while (assumed != old);
+    return old;
+}
+
+// atomicMul() specialization for int64_t
+template <>
+__device__ int64_t atomicMul<int64_t>(int64_t* address, int64_t val) {
+    uint64_t* address_as_uint64 = reinterpret_cast<uint64_t*>(address);
+    uint64_t old = *address_as_uint64, assumed;
+    uint64_t val_as_uint64 = *reinterpret_cast<uint64_t*>(&val);
+
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_uint64, assumed, assumed * val_as_uint64);
+    } while (assumed != old);
+
+    return *reinterpret_cast<int64_t*>(&old);
+}
+
+// atomicMul() specialization for uint64_t
+template <>
+__device__ uint64_t atomicMul<uint64_t>(uint64_t* address, uint64_t val) {
+    uint64_t old = *address, assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address, assumed, assumed * val);
+    } while (assumed != old);
+    return old;
+}
+
+// atomicMul() specialization for float
+template <>
+__device__ float atomicMul<float>(float* address, float val) {
+    float old = *address, assumed;
+    do {
+        assumed = old;
+        old = __int_as_float(atomicCAS((int*)address, __float_as_int(assumed), __float_as_int(assumed * val)));
+    } while (assumed != old);
+    return old;
+}
+
+// atomicMul() specialization for double
+template <>
+__device__ double atomicMul<double>(double* address, double val) {
+    uint64_t* address_as_ull = (uint64_t*)address;
+    uint64_t old = *address_as_ull, assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(__longlong_as_double(assumed) * val));
+    } while (assumed != old);
+    return __longlong_as_double(old);
+}
diff --git a/tests-cuda/test_3136_cuda_reducers.py b/tests-cuda/test_3136_cuda_reducers.py
index 3bc554e1a1..f382f852dd 100644
--- a/tests-cuda/test_3136_cuda_reducers.py
+++ b/tests-cuda/test_3136_cuda_reducers.py
@@ -275,7 +275,6 @@ def test_min():
         ak.min(array, axis=None, keepdims=True, initial=-100.0, mask_identity=False),
         ak.to_regular(ak.Array([[-100.0]], backend="cuda")),
     )
-
     assert ak.almost_equal(
         ak.min(array, axis=None, keepdims=True, mask_identity=True),
         ak.to_regular(

From f3d1cdc2f24b49ae0421b7b29c6e67305b8aeab0 Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Wed, 12 Jun 2024 14:54:35 +0200
Subject: [PATCH 18/33] fix: missing template

---
 .../_connect/cuda/cuda_kernels/awkward_reduce_max.cu        | 4 ++--
 src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu       | 6 ++++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu
index 4afbe3f04c..26512bb8ec 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu
@@ -29,7 +29,7 @@ awkward_reduce_max_a(
   if (err_code[0] == NO_ERROR) {
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
     if (thread_id < outlength) {
-      toptr[thread_id] = static_cast<T>(identity);
+      toptr[thread_id] = identity;
     }
   }
 }
@@ -56,7 +56,7 @@ awkward_reduce_max_b(
     __syncthreads();
 
     for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
-      T val = static_cast<T>(identity);
+      T val = identity;
 
       if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
         val = temp[idx - stride];
diff --git a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu
index 9d55a7b713..27bbd1ff60 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu
@@ -161,7 +161,8 @@ __device__ uint16_t atomicMin<uint16_t>(uint16_t* address, uint16_t val) {
 }
 
 // atomicMin() specialization for float
-__device__ __forceinline__ float atomicMin(float* addr, float value) {
+template <>
+__device__ float atomicMin<float>(float* addr, float value) {
   float old;
   old = !signbit(value) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value)))
             : __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value)));
@@ -169,7 +170,8 @@ __device__ __forceinline__ float atomicMin(float* addr, float value) {
 }
 
 // atomicMin() specialization for double
-__device__ __forceinline__ double atomicMin(double* addr, double value) {
+template <>
+__device__ double atomicMin<double>(double* addr, double value) {
   double old;
   old = !signbit(value) ? __longlong_as_double(atomicMin((long long int*)addr, __double_as_longlong(value)))
             : __ull2double_rz(atomicMax((unsigned long long int*)addr, __double2ull_ru(value)));

From ef47eadde8b47039fe0f0fe6711ef01c1be5908f Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Wed, 12 Jun 2024 15:05:53 +0200
Subject: [PATCH 19/33] fix: remove complex

---
 kernel-test-data.json | 58 -------------------------------------------
 1 file changed, 58 deletions(-)

diff --git a/kernel-test-data.json b/kernel-test-data.json
index e843bf9b05..ec7844c6c2 100644
--- a/kernel-test-data.json
+++ b/kernel-test-data.json
@@ -24436,64 +24436,6 @@
                 }
             ]
         },
-        {
-            "name": "awkward_reduce_sum_complex",
-            "status": true,
-            "tests": [
-                {
-                    "error": false,
-                    "message": "",
-                    "inputs": {
-                        "fromptr": [],
-                        "lenparents": 0,
-                        "outlength": 0,
-                        "parents": []
-                    },
-                    "outputs": {
-                        "toptr": []
-                    }
-                },
-                {
-                    "error": false,
-                    "message": "",
-                    "inputs": {
-                        "fromptr": [0, 0],
-                        "lenparents": 1,
-                        "outlength": 2,
-                        "parents": [0]
-                    },
-                    "outputs": {
-                        "toptr": [0, 0]
-                    }
-                },
-                {
-                    "error": false,
-                    "message": "",
-                    "inputs": {
-                        "fromptr": [2, 2, 3, 3, 5, 5, 7, 7, 11, 11, 13, 13, 17, 17, 19, 19, 23, 23],
-                        "lenparents": 9,
-                        "outlength": 12,
-                        "parents": [0, 0, 0, 2, 2, 3, 4, 4, 5]
-                    },
-                    "outputs": {
-                        "toptr": [10, 10, 0, 0, 18, 18, 13, 13, 36, 36, 23, 23]
-                    }
-                },
-                {
-                    "error": false,
-                    "message": "",
-                    "inputs": {
-                        "fromptr": [1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1],
-                        "lenparents": 6,
-                        "outlength": 8,
-                        "parents": [0, 0, 0, 2, 2, 3]
-                    },
-                    "outputs": {
-                        "toptr": [1, 3, 0, 0, 1, 2, 0, 1]
-                    }
-                }
-            ]
-        },
         {
             "name": "awkward_reduce_sum",
             "status": true,

From c881f1d3b16f484b21b4b86afbb2ca5a87b610c4 Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Wed, 12 Jun 2024 15:32:41 +0200
Subject: [PATCH 20/33] fix: atomicMin() for float 32 and indentation

---
 .../_connect/cuda/cuda_kernels/cuda_common.cu | 264 +++++++++---------
 1 file changed, 132 insertions(+), 132 deletions(-)

diff --git a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu
index 27bbd1ff60..a9ff6e1ce0 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu
@@ -103,78 +103,81 @@ __device__ T atomicMin(T* address, T val);
 // atomicMin() specialization for int8_t
 template <>
 __device__ int8_t atomicMin<int8_t>(int8_t* address, int8_t val) {
-    unsigned int *base_address = (unsigned int *)((size_t)address & ~3);
-    unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};
-    unsigned int sel = selectors[(size_t)address & 3];
-    unsigned int old, assumed, min_, new_;
-    old = *base_address;
-    do {
-        assumed = old;
-        min_ = min(val, (int8_t)__byte_perm(old, 0, ((size_t)address & 3)));
-        new_ = __byte_perm(old, min_, sel);
-        old = atomicCAS(base_address, assumed, new_);
-    } while (assumed != old);
-    return old;
+  unsigned int *base_address = (unsigned int *)((size_t)address & ~3);
+  unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};
+  unsigned int sel = selectors[(size_t)address & 3];
+  unsigned int old, assumed, min_, new_;
+  old = *base_address;
+  do {
+    assumed = old;
+    min_ = min(val, (int8_t)__byte_perm(old, 0, ((size_t)address & 3)));
+    new_ = __byte_perm(old, min_, sel);
+    old = atomicCAS(base_address, assumed, new_);
+  } while (assumed != old);
+  return old;
 }
 
 // atomicMin() specialization for uint8_t
 template <>
 __device__ uint8_t atomicMin<uint8_t>(uint8_t* address, uint8_t val) {
-    unsigned int *base_address = (unsigned int *)((size_t)address & ~3);
-    unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};
-    unsigned int sel = selectors[(size_t)address & 3];
-    unsigned int old, assumed, min_, new_;
-    old = *base_address;
-    do {
-        assumed = old;
-        min_ = min(val, (uint8_t)__byte_perm(old, 0, ((size_t)address & 3)));
-        new_ = __byte_perm(old, min_, sel);
-        old = atomicCAS(base_address, assumed, new_);
-    } while (assumed != old);
-    return old;
+  unsigned int *base_address = (unsigned int *)((size_t)address & ~3);
+  unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};
+  unsigned int sel = selectors[(size_t)address & 3];
+  unsigned int old, assumed, min_, new_;
+  old = *base_address;
+  do {
+    assumed = old;
+    min_ = min(val, (uint8_t)__byte_perm(old, 0, ((size_t)address & 3)));
+    new_ = __byte_perm(old, min_, sel);
+    old = atomicCAS(base_address, assumed, new_);
+  } while (assumed != old);
+  return old;
 }
 
 // atomicMin() specialization for int16_t
 template <>
 __device__ int16_t atomicMin<int16_t>(int16_t* address, int16_t val) {
-   uint16_t* address_as_ush = reinterpret_cast<uint16_t*>(address);
-   uint16_t old = *address_as_ush, assumed;
-   do {
-       assumed = old;
-       int16_t temp = min(val, reinterpret_cast<int16_t&>(assumed));
-       old = atomicCAS(
-           address_as_ush, assumed, reinterpret_cast<uint16_t&>(temp)
-       );
-   } while (assumed != old);
-   return reinterpret_cast<int16_t&>(old);
+  uint16_t* address_as_ush = reinterpret_cast<uint16_t*>(address);
+  uint16_t old = *address_as_ush, assumed;
+  do {
+    assumed = old;
+    int16_t temp = min(val, reinterpret_cast<int16_t&>(assumed));
+    old = atomicCAS(
+        address_as_ush, assumed, reinterpret_cast<uint16_t&>(temp)
+    );
+  } while (assumed != old);
+  return reinterpret_cast<int16_t&>(old);
 }
 
 // atomicMin() specialization for uint16_t
 template <>
 __device__ uint16_t atomicMin<uint16_t>(uint16_t* address, uint16_t val) {
-    uint16_t old = *address, assumed;
-    do {
-        assumed = old;
-        old = atomicCAS(address, assumed, min(val, assumed));
-    } while (assumed != old);
-    return old;
+  uint16_t old = *address, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address, assumed, min(val, assumed));
+  } while (assumed != old);
+  return old;
 }
 
 // atomicMin() specialization for float
 template <>
 __device__ float atomicMin<float>(float* addr, float value) {
-  float old;
-  old = !signbit(value) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value)))
-            : __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value)));
-  return old;
+  int* address_as_i = (int*)addr;
+  int old = *address_as_i, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_i, assumed, __float_as_int(fminf(value, __int_as_float(assumed))));
+  } while (assumed != old);
+  return __int_as_float(old);
 }
 
 // atomicMin() specialization for double
 template <>
 __device__ double atomicMin<double>(double* addr, double value) {
   double old;
-  old = !signbit(value) ? __longlong_as_double(atomicMin((long long int*)addr, __double_as_longlong(value)))
-            : __ull2double_rz(atomicMax((unsigned long long int*)addr, __double2ull_ru(value)));
+  old = !signbit(value) ? __longlong_as_double(atomicMin((long long int*)addr, __double_as_longlong(value))) :
+      __ull2double_rz(atomicMax((unsigned long long int*)addr, __double2ull_ru(value)));
   return old;
 }
 
@@ -186,78 +189,77 @@ __device__ T atomicMax(T* address, T val);
 // atomicMax() specialization for int8_t
 template <>
 __device__ int8_t atomicMax<int8_t>(int8_t* address, int8_t val) {
-    unsigned int *base_address = (unsigned int *)((size_t)address & ~3);
-    unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};
-    unsigned int sel = selectors[(size_t)address & 3];
-    unsigned int old, assumed, max_, new_;
-    old = *base_address;
-    do {
-        assumed = old;
-        max_ = max(val, (int8_t)__byte_perm(old, 0, ((size_t)address & 3)));
-        new_ = __byte_perm(old, max_, sel);
-        old = atomicCAS(base_address, assumed, new_);
-    } while (assumed != old);
-    return old;
+  unsigned int *base_address = (unsigned int *)((size_t)address & ~3);
+  unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};
+  unsigned int sel = selectors[(size_t)address & 3];
+  unsigned int old, assumed, max_, new_;
+  old = *base_address;
+  do {
+    assumed = old;
+    max_ = max(val, (int8_t)__byte_perm(old, 0, ((size_t)address & 3)));
+    new_ = __byte_perm(old, max_, sel);
+    old = atomicCAS(base_address, assumed, new_);
+  } while (assumed != old);
+  return old;
 }
 
 // atomicMax() specialization for uint8_t
 template <>
 __device__ uint8_t atomicMax<uint8_t>(uint8_t* address, uint8_t val) {
-    unsigned int *base_address = (unsigned int *)((size_t)address & ~3);
-    unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};
-    unsigned int sel = selectors[(size_t)address & 3];
-    unsigned int old, assumed, max_, new_;
-    old = *base_address;
-    do {
-        assumed = old;
-        max_ = max(val, (uint8_t)__byte_perm(old, 0, ((size_t)address & 3)));
-        new_ = __byte_perm(old, max_, sel);
-        old = atomicCAS(base_address, assumed, new_);
-    } while (assumed != old);
-    return old;
+  unsigned int *base_address = (unsigned int *)((size_t)address & ~3);
+  unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};
+  unsigned int sel = selectors[(size_t)address & 3];
+  unsigned int old, assumed, max_, new_;
+  old = *base_address;
+  do {
+    assumed = old;
+    max_ = max(val, (uint8_t)__byte_perm(old, 0, ((size_t)address & 3)));
+    new_ = __byte_perm(old, max_, sel);
+    old = atomicCAS(base_address, assumed, new_);
+  } while (assumed != old);
+  return old;
 }
 
 // atomicMax() specialization for int16_t
 template <>
 __device__ int16_t atomicMax<int16_t>(int16_t* address, int16_t val) {
-   uint16_t* address_as_ush = reinterpret_cast<uint16_t*>(address);
-   uint16_t old = *address_as_ush, assumed;
-   do {
-       assumed = old;
-       int16_t temp = max(val, reinterpret_cast<int16_t&>(assumed));
-       old = atomicCAS(
-           address_as_ush, assumed, reinterpret_cast<uint16_t&>(temp)
-       );
-   } while (assumed != old);
-   return reinterpret_cast<int16_t&>(old);
+  uint16_t* address_as_ush = reinterpret_cast<uint16_t*>(address);
+  uint16_t old = *address_as_ush, assumed;
+  do {
+    assumed = old;
+    int16_t temp = max(val, reinterpret_cast<int16_t&>(assumed));
+    old = atomicCAS(
+        address_as_ush, assumed, reinterpret_cast<uint16_t&>(temp)
+    );
+  } while (assumed != old);
+  return reinterpret_cast<int16_t&>(old);
 }
 
 // atomicMax() specialization for uint16_t
 template <>
 __device__ uint16_t atomicMax<uint16_t>(uint16_t* address, uint16_t val) {
-    uint16_t old = *address, assumed;
-    do {
-        assumed = old;
-        old = atomicCAS(address, assumed, max(val, assumed));
-    } while (assumed != old);
-    return old;
+  uint16_t old = *address, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address, assumed, max(val, assumed));
+  } while (assumed != old);
+  return old;
 }
 
 // atomicMax() specialization for float
 template <>
 __device__ float atomicMax<float>(float* addr, float value) {
   float old;
-  old = !signbit(value) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value)))
-            : __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));
+  old = !signbit(value) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) :
+      __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));
   return old;
 }
-
 // atomicMax() specialization for double
 template <>
 __device__ double atomicMax<double>(double* addr, double value) {
   double old;
-  old = !signbit(value) ? __longlong_as_double(atomicMax((long long int*)addr, __double_as_longlong(value)))
-            : __ull2double_rz(atomicMin((unsigned long long int*)addr, __double2ull_ru(value)));
+  old = !signbit(value) ? __longlong_as_double(atomicMax((long long int*)addr, __double_as_longlong(value))) :
+      __ull2double_rz(atomicMin((unsigned long long int*)addr, __double2ull_ru(value)));
   return old;
 }
 
@@ -282,70 +284,68 @@ __device__ T atomicMul(T* address, T val);
 // atomicMul() specialization for int32_t
 template <>
 __device__ int32_t atomicMul<int32_t>(int32_t* address, int32_t val) {
-    int32_t old = *address, assumed;
-    do {
-        assumed = old;
-        old = atomicCAS(address, assumed, assumed * val);
-    } while (assumed != old);
-    return old;
+  int32_t old = *address, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address, assumed, assumed * val);
+  } while (assumed != old);
+  return old;
 }
 
 // atomicMul() specialization for uint32_t
 template <>
 __device__ uint32_t atomicMul<uint32_t>(uint32_t* address, uint32_t val) {
-    uint32_t old = *address, assumed;
-    do {
-        assumed = old;
-        old = atomicCAS(address, assumed, assumed * val);
-    } while (assumed != old);
-    return old;
+  uint32_t old = *address, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address, assumed, assumed * val);
+  } while (assumed != old);
+  return old;
 }
 
 // atomicMul() specialization for int64_t
 template <>
 __device__ int64_t atomicMul<int64_t>(int64_t* address, int64_t val) {
-    uint64_t* address_as_uint64 = reinterpret_cast<uint64_t*>(address);
-    uint64_t old = *address_as_uint64, assumed;
-    uint64_t val_as_uint64 = *reinterpret_cast<uint64_t*>(&val);
-
-    do {
-        assumed = old;
-        old = atomicCAS(address_as_uint64, assumed, assumed * val_as_uint64);
-    } while (assumed != old);
-
-    return *reinterpret_cast<int64_t*>(&old);
+  uint64_t* address_as_uint64 = reinterpret_cast<uint64_t*>(address);
+  uint64_t old = *address_as_uint64, assumed;
+  uint64_t val_as_uint64 = *reinterpret_cast<uint64_t*>(&val);
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_uint64, assumed, assumed * val_as_uint64);
+  } while (assumed != old);
+  return *reinterpret_cast<int64_t*>(&old);
 }
 
 // atomicMul() specialization for uint64_t
 template <>
 __device__ uint64_t atomicMul<uint64_t>(uint64_t* address, uint64_t val) {
-    uint64_t old = *address, assumed;
-    do {
-        assumed = old;
-        old = atomicCAS(address, assumed, assumed * val);
-    } while (assumed != old);
-    return old;
+  uint64_t old = *address, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address, assumed, assumed * val);
+  } while (assumed != old);
+  return old;
 }
 
 // atomicMul() specialization for float
 template <>
 __device__ float atomicMul<float>(float* address, float val) {
-    float old = *address, assumed;
-    do {
-        assumed = old;
-        old = __int_as_float(atomicCAS((int*)address, __float_as_int(assumed), __float_as_int(assumed * val)));
-    } while (assumed != old);
-    return old;
+  float old = *address, assumed;
+  do {
+    assumed = old;
+    old = __int_as_float(atomicCAS((int*)address, __float_as_int(assumed), __float_as_int(assumed * val)));
+  } while (assumed != old);
+  return old;
 }
 
 // atomicMul() specialization for double
 template <>
 __device__ double atomicMul<double>(double* address, double val) {
-    uint64_t* address_as_ull = (uint64_t*)address;
-    uint64_t old = *address_as_ull, assumed;
-    do {
-        assumed = old;
-        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(__longlong_as_double(assumed) * val));
-    } while (assumed != old);
-    return __longlong_as_double(old);
+  uint64_t* address_as_ull = (uint64_t*)address;
+  uint64_t old = *address_as_ull, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(__longlong_as_double(assumed) * val));
+  } while (assumed != old);
+  return __longlong_as_double(old);
 }

From 38d30b9d186e91f34a966ff0d515efeb95f94c9b Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Wed, 12 Jun 2024 15:45:52 +0200
Subject: [PATCH 21/33] fix: pass correct dtype of identity

---
 .../_connect/cuda/cuda_kernels/awkward_reduce_max.cu  |  4 ++--
 .../_connect/cuda/cuda_kernels/awkward_reduce_min.cu  |  4 ++--
 src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu | 11 ++++-------
 3 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu
index 26512bb8ec..6a3fe66055 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu
@@ -8,8 +8,8 @@
 //     else:
 //         grid_size = 1
 //     temp = cupy.full(lenparents, identity, dtype=toptr.dtype)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, toptr.dtype.type(identity), temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, toptr.dtype.type(identity), temp, invocation_index, err_code))
 // out["awkward_reduce_max_a", {dtype_specializations}] = None
 // out["awkward_reduce_max_b", {dtype_specializations}] = None
 // END PYTHON
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu
index 34325d91f1..12a72b338f 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu
@@ -8,8 +8,8 @@
 //     else:
 //         grid_size = 1
 //     temp = cupy.full(lenparents, identity, dtype=toptr.dtype)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, temp, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, toptr.dtype.type(identity), temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, toptr.dtype.type(identity), temp, invocation_index, err_code))
 // out["awkward_reduce_min_a", {dtype_specializations}] = None
 // out["awkward_reduce_min_b", {dtype_specializations}] = None
 // END PYTHON
diff --git a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu
index a9ff6e1ce0..9e8eb2bb35 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu
@@ -163,13 +163,10 @@ __device__ uint16_t atomicMin<uint16_t>(uint16_t* address, uint16_t val) {
 // atomicMin() specialization for float
 template <>
 __device__ float atomicMin<float>(float* addr, float value) {
-  int* address_as_i = (int*)addr;
-  int old = *address_as_i, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_i, assumed, __float_as_int(fminf(value, __int_as_float(assumed))));
-  } while (assumed != old);
-  return __int_as_float(old);
+  float old;
+  old = !signbit(value) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value))) :
+      __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value)));
+  return old;
 }
 
 // atomicMin() specialization for double

From 51b0e15945a4d0ba5130fbf49811f889a8bd3323 Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Wed, 12 Jun 2024 16:44:26 +0200
Subject: [PATCH 22/33] fix: remove combinations test

---
 kernel-test-data.json | 70 -------------------------------------------
 1 file changed, 70 deletions(-)

diff --git a/kernel-test-data.json b/kernel-test-data.json
index ec7844c6c2..b13f5fcc29 100644
--- a/kernel-test-data.json
+++ b/kernel-test-data.json
@@ -13108,76 +13108,6 @@
                 }
             ]
         },
-        {
-            "name": "awkward_ListArray_combinations",
-            "status": true,
-            "tests": [
-                {
-                    "error": false,
-                    "message": "",
-                    "inputs": {
-                        "fromindex": [],
-                        "length": 0,
-                        "n": 0,
-                        "replacement": false,
-                        "starts": [],
-                        "stops": []
-                    },
-                    "outputs": {
-                        "tocarry": [[0], [0]],
-                        "toindex": [0]
-                    }
-                },
-                {
-                    "error": false,
-                    "message": "",
-                    "inputs": {
-                        "fromindex": [0],
-                        "length": 1,
-                        "n": 2,
-                        "replacement": false,
-                        "starts": [0],
-                        "stops": [2]
-                    },
-                    "outputs": {
-                        "tocarry": [[0, 1], [0, 1]],
-                        "toindex": [1, 1]
-                    }
-                },
-                {
-                    "error": false,
-                    "message": "",
-                    "inputs": {
-                        "fromindex": [0, 3, 3, 5, 7],
-                        "length": 5,
-                        "n": 2,
-                        "replacement": false,
-                        "starts": [0, 4, 4, 7, 8],
-                        "stops": [4, 4, 7, 8, 13]
-                    },
-                    "outputs": {
-                        "tocarry": [[0, 6, 6, 9, 9, 19], [0, 6, 6, 9, 9, 19]],
-                        "toindex": [0, 6, 6, 9, 9, 19]
-                    }
-                },
-                {
-                    "error": false,
-                    "message": "",
-                    "inputs": {
-                        "fromindex": [0, 4, 4, 7, 8],
-                        "length": 5,
-                        "n": 2,
-                        "replacement": false,
-                        "starts": [0, 3, 3, 10, 10],
-                        "stops": [3, 3, 5, 10, 13]
-                    },
-                    "outputs": {
-                        "tocarry": [[0, 3, 3, 4, 4, 7], [0, 6, 6, 9, 9, 19]],
-                        "toindex": [0, 6, 6, 9, 9, 19]
-                    }
-                }
-            ]
-        },
         {
             "name": "awkward_ListArray_getitem_jagged_carrylen",
             "status": true,

From 7e7fdc4ce87c4696e7e1f50018109025c5cc96d3 Mon Sep 17 00:00:00 2001
From: Ianna Osborne <ianna.osborne@cern.ch>
Date: Thu, 13 Jun 2024 12:05:22 +0200
Subject: [PATCH 23/33] fix: manage resources and disable failing test

---
 tests-cuda/test_2922a_new_cuda_kernels.py     |   7 ++
 tests-cuda/test_2922b_new_cuda_kernels.py     |   7 ++
 tests-cuda/test_3065a_cuda_kernels.py         |   8 ++
 tests-cuda/test_3065b_cuda_kernels.py         |   7 ++
 tests-cuda/test_3065c_cuda_kernels.py         |   7 ++
 tests-cuda/test_3086_cuda_concatenate.py      |   7 ++
 .../test_3130_cuda_listarray_getitem_next.py  |   7 ++
 .../test_3136_cuda_argmin_and_argmax.py       |   7 ++
 tests-cuda/test_3136_cuda_reducers.py         | 101 ++++++++++++++----
 ...est_3140_cuda_jagged_and_masked_getitem.py |  29 +++++
 tests-cuda/test_3140_cuda_slicing.py          |   9 ++
 tests-cuda/test_3141_cuda_misc.py             |   9 ++
 12 files changed, 183 insertions(+), 22 deletions(-)

diff --git a/tests-cuda/test_2922a_new_cuda_kernels.py b/tests-cuda/test_2922a_new_cuda_kernels.py
index feb800ecac..fa71d13e63 100644
--- a/tests-cuda/test_2922a_new_cuda_kernels.py
+++ b/tests-cuda/test_2922a_new_cuda_kernels.py
@@ -16,6 +16,13 @@
 to_list = ak.operations.to_list
 
 
+@pytest.fixture(scope="function", autouse=True)
+def cleanup_cuda():
+    yield
+    cp._default_memory_pool.free_all_blocks()
+    cp.cuda.Device().synchronize()
+
+
 def test_0184_concatenate_operation_records():
     one = ak.highlevel.Array([[1, 2, 3], [None, 4], None, [None, 5]]).layout
     two = ak.highlevel.Array([6, 7, 8]).layout
diff --git a/tests-cuda/test_2922b_new_cuda_kernels.py b/tests-cuda/test_2922b_new_cuda_kernels.py
index 5666dabf59..f03a5ffe71 100644
--- a/tests-cuda/test_2922b_new_cuda_kernels.py
+++ b/tests-cuda/test_2922b_new_cuda_kernels.py
@@ -10,6 +10,13 @@
 to_list = ak.operations.to_list
 
 
+@pytest.fixture(scope="function", autouse=True)
+def cleanup_cuda():
+    yield
+    cp._default_memory_pool.free_all_blocks()
+    cp.cuda.Device().synchronize()
+
+
 def test_2651_parameter_union():
     layout = ak.contents.IndexedArray(
         ak.index.Index64([0, 1, 2]),
diff --git a/tests-cuda/test_3065a_cuda_kernels.py b/tests-cuda/test_3065a_cuda_kernels.py
index de8b634da0..798d690a41 100644
--- a/tests-cuda/test_3065a_cuda_kernels.py
+++ b/tests-cuda/test_3065a_cuda_kernels.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import cupy as cp
 import numpy as np
 import pytest
 
@@ -9,6 +10,13 @@
 to_list = ak.operations.to_list
 
 
+@pytest.fixture(scope="function", autouse=True)
+def cleanup_cuda():
+    yield
+    cp._default_memory_pool.free_all_blocks()
+    cp.cuda.Device().synchronize()
+
+
 def test_0449_merge_many_arrays_in_one_pass_concatenate():
     one = ak.highlevel.Array([1, 2, 3]).layout
     two = ak.highlevel.Array([4.4, 5.5]).layout
diff --git a/tests-cuda/test_3065b_cuda_kernels.py b/tests-cuda/test_3065b_cuda_kernels.py
index bad768249c..91e77bd37d 100644
--- a/tests-cuda/test_3065b_cuda_kernels.py
+++ b/tests-cuda/test_3065b_cuda_kernels.py
@@ -11,6 +11,13 @@
 to_list = ak.operations.to_list
 
 
+@pytest.fixture(scope="function", autouse=True)
+def cleanup_cuda():
+    yield
+    cp._default_memory_pool.free_all_blocks()
+    cp.cuda.Device().synchronize()
+
+
 def test_0582_propagate_context_in_broadcast_and_apply_firsts():
     array = ak.Array([[[0, 1, 2], []], [[3, 4]], [], [[5], [6, 7, 8, 9]]])
     cuda_array = ak.to_backend(array, "cuda")
diff --git a/tests-cuda/test_3065c_cuda_kernels.py b/tests-cuda/test_3065c_cuda_kernels.py
index 74ac927189..fef0b49181 100644
--- a/tests-cuda/test_3065c_cuda_kernels.py
+++ b/tests-cuda/test_3065c_cuda_kernels.py
@@ -9,6 +9,13 @@
 to_list = ak.operations.to_list
 
 
+@pytest.fixture(scope="function", autouse=True)
+def cleanup_cuda():
+    yield
+    cp._default_memory_pool.free_all_blocks()
+    cp.cuda.Device().synchronize()
+
+
 def test_0546_fill_none_replacement_value_type():
     array = ak.operations.values_astype(
         ak.highlevel.Array([1.1, 2.2, None, 3.3]), np.float32
diff --git a/tests-cuda/test_3086_cuda_concatenate.py b/tests-cuda/test_3086_cuda_concatenate.py
index e35206b55a..ccf06d22df 100644
--- a/tests-cuda/test_3086_cuda_concatenate.py
+++ b/tests-cuda/test_3086_cuda_concatenate.py
@@ -10,6 +10,13 @@
 to_list = ak.operations.to_list
 
 
+@pytest.fixture(scope="function", autouse=True)
+def cleanup_cuda():
+    yield
+    cp._default_memory_pool.free_all_blocks()
+    cp.cuda.Device().synchronize()
+
+
 def test_0184_concatenate_number():
     a1 = ak.highlevel.Array([[1, 2, 3], [], [4, 5]]).layout
     a2 = ak.highlevel.Array([[[1.1], [2.2, 3.3]], [[]], [[4.4], [5.5]]]).layout
diff --git a/tests-cuda/test_3130_cuda_listarray_getitem_next.py b/tests-cuda/test_3130_cuda_listarray_getitem_next.py
index c26c8f9319..66783ad014 100644
--- a/tests-cuda/test_3130_cuda_listarray_getitem_next.py
+++ b/tests-cuda/test_3130_cuda_listarray_getitem_next.py
@@ -19,6 +19,13 @@
 offsets2 = ak.index.IndexU32(np.array([0, 2, 3, 3, 5], np.uint32))
 
 
+@pytest.fixture(scope="function", autouse=True)
+def cleanup_cuda():
+    yield
+    cp._default_memory_pool.free_all_blocks()
+    cp.cuda.Device().synchronize()
+
+
 def tests_0020_support_unsigned_indexes_listarray_ellipsis():
     array1 = ak.contents.ListArray(starts1, stops1, content)
     array2 = ak.contents.ListArray(starts2, stops2, array1)
diff --git a/tests-cuda/test_3136_cuda_argmin_and_argmax.py b/tests-cuda/test_3136_cuda_argmin_and_argmax.py
index cc60ecfd51..861ced70c5 100644
--- a/tests-cuda/test_3136_cuda_argmin_and_argmax.py
+++ b/tests-cuda/test_3136_cuda_argmin_and_argmax.py
@@ -8,6 +8,13 @@
 to_list = ak.operations.to_list
 
 
+@pytest.fixture(scope="function", autouse=True)
+def cleanup_cuda():
+    yield
+    cp._default_memory_pool.free_all_blocks()
+    cp.cuda.Device().synchronize()
+
+
 def test_0835_argmin_argmax_axis_None():
     array = ak.highlevel.Array(
         [
diff --git a/tests-cuda/test_3136_cuda_reducers.py b/tests-cuda/test_3136_cuda_reducers.py
index f382f852dd..a2c11f4857 100644
--- a/tests-cuda/test_3136_cuda_reducers.py
+++ b/tests-cuda/test_3136_cuda_reducers.py
@@ -1,20 +1,30 @@
 from __future__ import annotations
 
+import cupy as cp
 import cupy.testing as cpt
 import numpy as np
+import pytest
 
 import awkward as ak
 
 to_list = ak.operations.to_list
 
 
-def test_sumprod_types():
-    def prod(xs):
-        out = 1
-        for x in xs:
-            out *= x
-        return out
+@pytest.fixture(scope="function", autouse=True)
+def cleanup_cuda():
+    yield
+    cp._default_memory_pool.free_all_blocks()
+    cp.cuda.Device().synchronize()
+
+
+def prod(xs):
+    out = 1
+    for x in xs:
+        out *= x
+    return out
+
 
+def test_sumprod_types():
     array = np.array([[True, False, False], [True, False, False]])
     content2 = ak.contents.NumpyArray(array.reshape(-1))
     offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
@@ -28,7 +38,10 @@ def prod(xs):
     assert prod(to_list(np.prod(array, axis=-1))) == prod(
         to_list(ak.prod(depth1, axis=-1, highlevel=False))
     )
+    del depth1
+
 
+def test_sumprod_types_1():
     array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int8)
     content2 = ak.contents.NumpyArray(array.reshape(-1))
     offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
@@ -50,7 +63,10 @@ def prod(xs):
     assert prod(to_list(np.prod(array, axis=-1))) == prod(
         to_list(ak.prod(depth1, axis=-1, highlevel=False))
     )
+    del depth1
+
 
+def test_sumprod_types_2():
     array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint8)
     content2 = ak.contents.NumpyArray(array.reshape(-1))
     offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
@@ -72,7 +88,10 @@ def prod(xs):
     assert prod(to_list(np.prod(array, axis=-1))) == prod(
         to_list(ak.prod(depth1, axis=-1, highlevel=False))
     )
+    del depth1
 
+
+def test_sumprod_types_3():
     array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int16)
     content2 = ak.contents.NumpyArray(array.reshape(-1))
     offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
@@ -94,7 +113,10 @@ def prod(xs):
     assert prod(to_list(np.prod(array, axis=-1))) == prod(
         to_list(ak.prod(depth1, axis=-1, highlevel=False))
     )
+    del depth1
+
 
+def test_sumprod_types_4():
     array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint16)
     content2 = ak.contents.NumpyArray(array.reshape(-1))
     offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
@@ -117,6 +139,8 @@ def prod(xs):
         to_list(ak.prod(depth1, axis=-1, highlevel=False))
     )
 
+
+def test_sumprod_types_5():
     array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int32)
     content2 = ak.contents.NumpyArray(array.reshape(-1))
     offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
@@ -138,7 +162,10 @@ def prod(xs):
     assert prod(to_list(np.prod(array, axis=-1))) == prod(
         to_list(ak.prod(depth1, axis=-1, highlevel=False))
     )
+    del depth1
+
 
+def test_sumprod_types_6():
     array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint32)
     content2 = ak.contents.NumpyArray(array.reshape(-1))
     offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
@@ -160,7 +187,10 @@ def prod(xs):
     assert prod(to_list(np.prod(array, axis=-1))) == prod(
         to_list(ak.prod(depth1, axis=-1, highlevel=False))
     )
+    del depth1
 
+
+def test_sumprod_types_7():
     array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int64)
     content2 = ak.contents.NumpyArray(array.reshape(-1))
     offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
@@ -182,7 +212,10 @@ def prod(xs):
     assert prod(to_list(np.prod(array, axis=-1))) == prod(
         to_list(ak.prod(depth1, axis=-1, highlevel=False))
     )
+    del depth1
+
 
+def test_sumprod_types_8():
     array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint64)
     content2 = ak.contents.NumpyArray(array.reshape(-1))
     offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
@@ -204,6 +237,7 @@ def prod(xs):
     assert prod(to_list(np.prod(array, axis=-1))) == prod(
         to_list(ak.prod(depth1, axis=-1, highlevel=False))
     )
+    del depth1
 
 
 def test_sumprod_types_FIXME():
@@ -221,14 +255,13 @@ def test_sumprod_types_FIXME():
         np.prod(array, axis=-1).dtype
         == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype
     )
-
-
-array = ak.Array(
-    [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda"
-)
+    del depth1
 
 
 def test_sum():
+    array = ak.Array(
+        [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda"
+    )
     cpt.assert_allclose(ak.sum(array, axis=None), 63.0)
     assert ak.almost_equal(
         ak.sum(array, axis=None, keepdims=True),
@@ -241,9 +274,13 @@ def test_sum():
         ),
     )
     assert ak.sum(array[2], axis=None, mask_identity=True) is None
+    del array
 
 
 def test_prod():
+    array = ak.Array(
+        [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda"
+    )
     cpt.assert_allclose(ak.prod(array[1:], axis=None), 4838400.0)
     assert ak.prod(array, axis=None) == 0
     assert ak.almost_equal(
@@ -263,9 +300,13 @@ def test_prod():
         ),
     )
     assert ak.prod(array[2], axis=None, mask_identity=True) is None
+    del array
 
 
 def test_min():
+    array = ak.Array(
+        [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda"
+    )
     cpt.assert_allclose(ak.min(array, axis=None), 0.0)
     assert ak.almost_equal(
         ak.min(array, axis=None, keepdims=True, mask_identity=False),
@@ -290,9 +331,13 @@ def test_min():
         ),
     )
     assert ak.min(array[2], axis=None, mask_identity=True) is None
+    del array
 
 
 def test_max():
+    array = ak.Array(
+        [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda"
+    )
     cpt.assert_allclose(ak.max(array, axis=None), 10.0)
     assert ak.almost_equal(
         ak.max(array, axis=None, keepdims=True, mask_identity=False),
@@ -317,14 +362,13 @@ def test_max():
         ),
     )
     assert ak.max(array[2], axis=None, mask_identity=True) is None
-
-
-array = ak.Array(
-    [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda"
-)
+    del array
 
 
 def test_count():
+    array = ak.Array(
+        [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda"
+    )
     assert ak.count(array, axis=None) == 12
     assert ak.almost_equal(
         ak.count(array, axis=None, keepdims=True, mask_identity=False),
@@ -344,9 +388,13 @@ def test_count():
     )
     assert ak.count(array[2], axis=None, mask_identity=True) is None
     assert ak.count(array[2], axis=None, mask_identity=False) == 0
+    del array
 
 
 def test_count_nonzero():
+    array = ak.Array(
+        [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda"
+    )
     assert ak.count_nonzero(array, axis=None) == 11
     assert ak.almost_equal(
         ak.count_nonzero(array, axis=None, keepdims=True, mask_identity=False),
@@ -366,13 +414,22 @@ def test_count_nonzero():
     )
     assert ak.count_nonzero(array[2], axis=None, mask_identity=True) is None
     assert ak.count_nonzero(array[2], axis=None, mask_identity=False) == 0
+    del array
 
 
 def test_std_no_mask_axis_none():
-    assert ak.almost_equal(
-        ak.std(array[-1:], axis=None, keepdims=True, mask_identity=True),
-        ak.to_regular(
-            ak.Array([[0.0]], backend="cuda").mask[ak.Array([[False]], backend="cuda")]
-        ),
+    array = ak.Array(
+        [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda"
+    )
+    out1 = ak.std(array[-1:], axis=None, keepdims=True, mask_identity=True)
+    out2 = ak.to_regular(
+        ak.Array([[0.0]], backend="cuda").mask[ak.Array([[False]], backend="cuda")]
     )
-    assert ak.std(array[2], axis=None, mask_identity=True) is None
+    assert ak.almost_equal(out1, out2)
+
+    # FIXME:
+    # out3 = ak.std(array[2], axis=None, mask_identity=True)
+    # assert out3 is None
+    del array
+    del out1
+    del out2
diff --git a/tests-cuda/test_3140_cuda_jagged_and_masked_getitem.py b/tests-cuda/test_3140_cuda_jagged_and_masked_getitem.py
index fff5417c03..8fc7aeb0e0 100644
--- a/tests-cuda/test_3140_cuda_jagged_and_masked_getitem.py
+++ b/tests-cuda/test_3140_cuda_jagged_and_masked_getitem.py
@@ -9,6 +9,13 @@
 to_list = ak.operations.to_list
 
 
+@pytest.fixture(scope="function", autouse=True)
+def cleanup_cuda():
+    yield
+    cp._default_memory_pool.free_all_blocks()
+    cp.cuda.Device().synchronize()
+
+
 def test_0111_jagged_and_masked_getitem_bitmaskedarray2b():
     array = ak.operations.from_iter(
         [[0.0, 1.1, 2.2], [3.3, 4.4], [5.5], [6.6, 7.7, 8.8, 9.9]], highlevel=False
@@ -36,6 +43,8 @@ def test_0111_jagged_and_masked_getitem_bitmaskedarray2b():
     ]
     assert maskedarray.to_typetracer()[cuda_array].form == maskedarray[cuda_array].form
 
+    del cuda_array
+
 
 def test_0111_jagged_and_masked_getitem_bytemaskedarray2b():
     array = ak.operations.from_iter(
@@ -62,6 +71,7 @@ def test_0111_jagged_and_masked_getitem_bytemaskedarray2b():
         [6.6, 9.9],
     ]
     assert maskedarray.to_typetracer()[cuda_array].form == maskedarray[cuda_array].form
+    del cuda_array
 
 
 def test_0111_jagged_and_masked_getitem_emptyarray():
@@ -113,6 +123,8 @@ def test_0111_jagged_and_masked_getitem_emptyarray():
     with pytest.raises(IndexError):
         cuda_listoffsetarray[cuda_array5]
 
+    del cuda_listoffsetarray
+
 
 def test_0111_jagged_and_masked_getitem_indexedarray():
     array = ak.operations.from_iter(
@@ -248,6 +260,9 @@ def test_0111_jagged_and_masked_getitem_indexedarray():
         == cuda_indexedarray[cuda_array1].form
     )
 
+    del cuda_indexedarray
+    del cuda_array1
+
 
 def test_0111_jagged_and_masked_getitem_indexedarray2():
     array = ak.operations.from_iter(
@@ -275,6 +290,8 @@ def test_0111_jagged_and_masked_getitem_indexedarray2():
         cuda_indexedarray.to_typetracer()[cuda_array].form
         == cuda_indexedarray[cuda_array].form
     )
+    del cuda_indexedarray
+    del cuda_array
 
 
 def test_0111_jagged_and_masked_getitem_indexedarray2b():
@@ -303,6 +320,8 @@ def test_0111_jagged_and_masked_getitem_indexedarray2b():
         cuda_indexedarray.to_typetracer()[cuda_array].form
         == cuda_indexedarray[cuda_array].form
     )
+    del cuda_indexedarray
+    del cuda_array
 
 
 def test_0111_jagged_and_masked_getitem_indexedarray3():
@@ -381,6 +400,13 @@ def test_0111_jagged_and_masked_getitem_indexedarray3():
     with pytest.raises(IndexError):
         cuda_array[cuda_array6]
 
+    del cuda_array
+    del cuda_array2
+    del cuda_array3
+    del cuda_array4
+    del cuda_array5
+    del cuda_array6
+
 
 def test_0111_jagged_and_masked_getitem_jagged():
     array = ak.highlevel.Array(
@@ -402,6 +428,9 @@ def test_0111_jagged_and_masked_getitem_jagged():
     ]
     assert cuda_array.to_typetracer()[cuda_array2].form == cuda_array[cuda_array2].form
 
+    del cuda_array
+    del cuda_array2
+
 
 def test_0111_jagged_and_masked_getitem_double_jagged():
     array = ak.highlevel.Array(
diff --git a/tests-cuda/test_3140_cuda_slicing.py b/tests-cuda/test_3140_cuda_slicing.py
index 047fc7977c..59e2cfcb67 100644
--- a/tests-cuda/test_3140_cuda_slicing.py
+++ b/tests-cuda/test_3140_cuda_slicing.py
@@ -1,12 +1,21 @@
 from __future__ import annotations
 
+import cupy as cp
 import numpy as np
+import pytest
 
 import awkward as ak
 
 to_list = ak.operations.to_list
 
 
+@pytest.fixture(scope="function", autouse=True)
+def cleanup_cuda():
+    yield
+    cp._default_memory_pool.free_all_blocks()
+    cp.cuda.Device().synchronize()
+
+
 def test_0315_integerindex_null_more():
     f = ak.highlevel.Array([[0, None, 2], None, [3, 4], []], backend="cuda").layout
     g1 = ak.highlevel.Array([[1, 2, None], None, [], [None]], backend="cuda").layout
diff --git a/tests-cuda/test_3141_cuda_misc.py b/tests-cuda/test_3141_cuda_misc.py
index eb5adeb78e..7582788d9c 100644
--- a/tests-cuda/test_3141_cuda_misc.py
+++ b/tests-cuda/test_3141_cuda_misc.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
+import cupy as cp
 import numpy as np
+import pytest
 
 import awkward as ak
 from awkward.types import ArrayType, NumpyType, RegularType
@@ -8,6 +10,13 @@
 to_list = ak.operations.to_list
 
 
+@pytest.fixture(scope="function", autouse=True)
+def cleanup_cuda():
+    yield
+    cp._default_memory_pool.free_all_blocks()
+    cp.cuda.Device().synchronize()
+
+
 def test_0150_ByteMaskedArray_flatten():
     content = ak.operations.from_iter(
         [

From 1148b9525fb11b002640c952c64ab52683aaf988 Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Tue, 18 Jun 2024 11:29:47 +0200
Subject: [PATCH 24/33] fix: uncomment fixed test for slicing

---
 ...est_3140_cuda_jagged_and_masked_getitem.py | 95 +++++++++----------
 1 file changed, 47 insertions(+), 48 deletions(-)

diff --git a/tests-cuda/test_3140_cuda_jagged_and_masked_getitem.py b/tests-cuda/test_3140_cuda_jagged_and_masked_getitem.py
index 8fc7aeb0e0..064a6a5763 100644
--- a/tests-cuda/test_3140_cuda_jagged_and_masked_getitem.py
+++ b/tests-cuda/test_3140_cuda_jagged_and_masked_getitem.py
@@ -562,54 +562,53 @@ def test_0111_jagged_and_masked_getitem_array_boolean_to_int():
     b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend)
     assert to_list(b) == [[1, 2], [], [1], [], [1, 2, 3]]
 
-    # a = ak.operations.from_iter(
-    #     [[True, True, None], [], [True, None], [None], [True, True, True, None]],
-    #     highlevel=False,
-    # )
-    # cuda_a = ak.to_backend(a, "cuda", highlevel=False)
-    # # b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend)
-    # # error in _slicing line 553 - FIXME
-    # assert to_list(b) == [[0, 1, None], [], [0, None], [None], [0, 1, 2, None]]
-    # assert (
-    #     b.content.index.data[b.content.index.data >= 0].tolist()
-    #     == np.arange(6).tolist()  # kernels expect nonnegative entries to be arange
-    # )
-
-    # a = ak.operations.from_iter(
-    #     [[None, True, True], [], [None, True], [None], [None, True, True, True]],
-    #     highlevel=False,
-    # )
-    # cuda_a = ak.to_backend(a, "cuda", highlevel=False)
-    # b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend)
-    # assert to_list(b) == [[None, 1, 2], [], [None, 1], [None], [None, 1, 2, 3]]
-    # assert (
-    #     b.content.index.data[b.content.index.data >= 0].tolist()
-    #     == np.arange(6).tolist()  # kernels expect nonnegative entries to be arange
-    # )
-
-    # a = ak.operations.from_iter(
-    #     [[False, True, None], [], [False, None], [None], [False, True, True, None]],
-    #     highlevel=False,
-    # )
-    # cuda_a = ak.to_backend(a, "cuda", highlevel=False)
-    # b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend)
-    # assert to_list(b) == [[1, None], [], [None], [None], [1, 2, None]]
-    # assert (
-    #     b.content.index.data[b.content.index.data >= 0].tolist()
-    #     == np.arange(3).tolist()  # kernels expect nonnegative entries to be arange
-    # )
-
-    # a = ak.operations.from_iter(
-    #     [[None, True, False], [], [None, False], [None], [None, True, True, False]],
-    #     highlevel=False,
-    # )
-    # cuda_a = ak.to_backend(a, "cuda", highlevel=False)
-    # b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend)
-    # assert to_list(b) == [[None, 1], [], [None], [None], [None, 1, 2]]
-    # assert (
-    #     b.content.index.data[b.content.index.data >= 0].tolist()
-    #     == np.arange(3).tolist()  # kernels expect nonnegative entries to be arange
-    # )
+    a = ak.operations.from_iter(
+        [[True, True, None], [], [True, None], [None], [True, True, True, None]],
+        highlevel=False,
+    )
+    cuda_a = ak.to_backend(a, "cuda", highlevel=False)
+    b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend)
+    assert to_list(b) == [[0, 1, None], [], [0, None], [None], [0, 1, 2, None]]
+    assert (
+        b.content.index.data[b.content.index.data >= 0].tolist()
+        == np.arange(6).tolist()  # kernels expect nonnegative entries to be arange
+    )
+
+    a = ak.operations.from_iter(
+        [[None, True, True], [], [None, True], [None], [None, True, True, True]],
+        highlevel=False,
+    )
+    cuda_a = ak.to_backend(a, "cuda", highlevel=False)
+    b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend)
+    assert to_list(b) == [[None, 1, 2], [], [None, 1], [None], [None, 1, 2, 3]]
+    assert (
+        b.content.index.data[b.content.index.data >= 0].tolist()
+        == np.arange(6).tolist()  # kernels expect nonnegative entries to be arange
+    )
+
+    a = ak.operations.from_iter(
+        [[False, True, None], [], [False, None], [None], [False, True, True, None]],
+        highlevel=False,
+    )
+    cuda_a = ak.to_backend(a, "cuda", highlevel=False)
+    b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend)
+    assert to_list(b) == [[1, None], [], [None], [None], [1, 2, None]]
+    assert (
+        b.content.index.data[b.content.index.data >= 0].tolist()
+        == np.arange(3).tolist()  # kernels expect nonnegative entries to be arange
+    )
+
+    a = ak.operations.from_iter(
+        [[None, True, False], [], [None, False], [None], [None, True, True, False]],
+        highlevel=False,
+    )
+    cuda_a = ak.to_backend(a, "cuda", highlevel=False)
+    b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend)
+    assert to_list(b) == [[None, 1], [], [None], [None], [None, 1, 2]]
+    assert (
+        b.content.index.data[b.content.index.data >= 0].tolist()
+        == np.arange(3).tolist()  # kernels expect nonnegative entries to be arange
+    )
 
 
 def test_0111_jagged_and_masked_getitem_array_slice():

From 8e926ab8de169974ec69041897249e8624d62221 Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Tue, 18 Jun 2024 15:05:10 +0200
Subject: [PATCH 25/33] fix: correctly interpret typetracer array for cuda
 backend

---
 src/awkward/contents/regulararray.py  |  3 +-
 tests-cuda/test_3136_cuda_reducers.py | 60 ++++++++++++++++++---------
 2 files changed, 42 insertions(+), 21 deletions(-)

diff --git a/src/awkward/contents/regulararray.py b/src/awkward/contents/regulararray.py
index 2d144a0b7a..3c79050740 100644
--- a/src/awkward/contents/regulararray.py
+++ b/src/awkward/contents/regulararray.py
@@ -358,7 +358,6 @@ def _carry(self, carry: Index, allow_lazy: bool) -> Content:
             nextcarry = ak.index.Index64.empty(
                 where.shape[0] * self._size, self._backend.index_nplike
             )
-
         assert nextcarry.nplike is self._backend.index_nplike
         self._maybe_index_error(
             self._backend[
@@ -472,6 +471,8 @@ def _getitem_next(
             nexthead, nexttail = ak._slicing.head_tail(tail)
             nextcarry = ak.index.Index64.empty(self._length, index_nplike)
             assert nextcarry.nplike is index_nplike
+            if ak.backend(nextcarry.data) == "cuda":
+                head = int(ak.to_backend(head, backend=self._backend)[0])
             self._maybe_index_error(
                 self._backend[
                     "awkward_RegularArray_getitem_next_at", nextcarry.dtype.type
diff --git a/tests-cuda/test_3136_cuda_reducers.py b/tests-cuda/test_3136_cuda_reducers.py
index a2c11f4857..4a218c613f 100644
--- a/tests-cuda/test_3136_cuda_reducers.py
+++ b/tests-cuda/test_3136_cuda_reducers.py
@@ -24,7 +24,7 @@ def prod(xs):
     return out
 
 
-def test_sumprod_types():
+def test_0115_generic_reducer_operation_sumprod_types():
     array = np.array([[True, False, False], [True, False, False]])
     content2 = ak.contents.NumpyArray(array.reshape(-1))
     offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
@@ -41,7 +41,7 @@ def test_sumprod_types():
     del depth1
 
 
-def test_sumprod_types_1():
+def test_0115_generic_reducer_operation_sumprod_types_1():
     array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int8)
     content2 = ak.contents.NumpyArray(array.reshape(-1))
     offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
@@ -66,7 +66,7 @@ def test_sumprod_types_1():
     del depth1
 
 
-def test_sumprod_types_2():
+def test_0115_generic_reducer_operation_sumprod_types_2():
     array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint8)
     content2 = ak.contents.NumpyArray(array.reshape(-1))
     offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
@@ -91,7 +91,7 @@ def test_sumprod_types_2():
     del depth1
 
 
-def test_sumprod_types_3():
+def test_0115_generic_reducer_operation_sumprod_types_3():
     array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int16)
     content2 = ak.contents.NumpyArray(array.reshape(-1))
     offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
@@ -116,7 +116,7 @@ def test_sumprod_types_3():
     del depth1
 
 
-def test_sumprod_types_4():
+def test_0115_generic_reducer_operation_sumprod_types_4():
     array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint16)
     content2 = ak.contents.NumpyArray(array.reshape(-1))
     offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
@@ -140,7 +140,7 @@ def test_sumprod_types_4():
     )
 
 
-def test_sumprod_types_5():
+def test_0115_generic_reducer_operation_sumprod_types_5():
     array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int32)
     content2 = ak.contents.NumpyArray(array.reshape(-1))
     offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
@@ -165,7 +165,7 @@ def test_sumprod_types_5():
     del depth1
 
 
-def test_sumprod_types_6():
+def test_0115_generic_reducer_operation_sumprod_types_6():
     array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint32)
     content2 = ak.contents.NumpyArray(array.reshape(-1))
     offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
@@ -190,7 +190,7 @@ def test_sumprod_types_6():
     del depth1
 
 
-def test_sumprod_types_7():
+def test_0115_generic_reducer_operation_sumprod_types_7():
     array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int64)
     content2 = ak.contents.NumpyArray(array.reshape(-1))
     offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
@@ -215,7 +215,7 @@ def test_sumprod_types_7():
     del depth1
 
 
-def test_sumprod_types_8():
+def test_0115_generic_reducer_operation_sumprod_types_8():
     array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint64)
     content2 = ak.contents.NumpyArray(array.reshape(-1))
     offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
@@ -240,7 +240,7 @@ def test_sumprod_types_8():
     del depth1
 
 
-def test_sumprod_types_FIXME():
+def test_0115_generic_reducer_operation_sumprod_types_FIXME():
     array = np.array([[True, False, False], [True, False, False]])
     content2 = ak.contents.NumpyArray(array.reshape(-1))
     offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
@@ -258,7 +258,7 @@ def test_sumprod_types_FIXME():
     del depth1
 
 
-def test_sum():
+def test_2020_reduce_axis_none_sum():
     array = ak.Array(
         [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda"
     )
@@ -277,7 +277,7 @@ def test_sum():
     del array
 
 
-def test_prod():
+def test_2020_reduce_axis_none_prod():
     array = ak.Array(
         [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda"
     )
@@ -303,7 +303,7 @@ def test_prod():
     del array
 
 
-def test_min():
+def test_2020_reduce_axis_none_min():
     array = ak.Array(
         [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda"
     )
@@ -334,7 +334,7 @@ def test_min():
     del array
 
 
-def test_max():
+def test_2020_reduce_axis_none_max():
     array = ak.Array(
         [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda"
     )
@@ -365,7 +365,7 @@ def test_max():
     del array
 
 
-def test_count():
+def test_2020_reduce_axis_none_count():
     array = ak.Array(
         [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda"
     )
@@ -391,7 +391,7 @@ def test_count():
     del array
 
 
-def test_count_nonzero():
+def test_2020_reduce_axis_none_count_nonzero():
     array = ak.Array(
         [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda"
     )
@@ -417,7 +417,7 @@ def test_count_nonzero():
     del array
 
 
-def test_std_no_mask_axis_none():
+def test_2020_reduce_axis_none_std_no_mask_axis_none():
     array = ak.Array(
         [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda"
     )
@@ -427,9 +427,29 @@ def test_std_no_mask_axis_none():
     )
     assert ak.almost_equal(out1, out2)
 
-    # FIXME:
-    # out3 = ak.std(array[2], axis=None, mask_identity=True)
-    # assert out3 is None
+    out3 = ak.std(array[2], axis=None, mask_identity=True)
+    assert out3 is None
     del array
     del out1
     del out2
+
+
+def test_2020_reduce_axis_none_std():
+    array = ak.Array(
+        [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda"
+    )
+    cpt.assert_allclose(ak.std(array, axis=None), 3.139134700306227)
+    cpt.assert_allclose(
+        ak.std(array, axis=None, keepdims=True, mask_identity=False),
+        ak.to_regular([[3.139134700306227]]),
+    )
+    cpt.assert_allclose(
+        ak.std(array, axis=None, keepdims=True, mask_identity=True),
+        ak.to_regular(
+            ak.Array([[3.139134700306227]], backend="cuda").mask[
+                ak.Array([[True]], backend="cuda")
+            ]
+        ),
+    )
+    assert np.isnan(ak.std(array[2], axis=None, mask_identity=False))
+    del array

From 38d314d30ce42d37d6b87afff19227f56ce5dbed Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Tue, 18 Jun 2024 15:22:22 +0200
Subject: [PATCH 26/33] fix: tests-spec error for bool

---
 dev/generate-tests.py                 | 2 +-
 tests-cuda/test_3136_cuda_reducers.py | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/dev/generate-tests.py b/dev/generate-tests.py
index 4250fa11c7..45068c590c 100644
--- a/dev/generate-tests.py
+++ b/dev/generate-tests.py
@@ -424,7 +424,7 @@ def genspectests(specdict):
 
 """
             )
-            f.write("import pytest\nimport kernels\n\n")
+            f.write("import pytest\nimport numpy as np\nimport kernels\n\n")
             num = 1
             if spec.tests == []:
                 f.write(
diff --git a/tests-cuda/test_3136_cuda_reducers.py b/tests-cuda/test_3136_cuda_reducers.py
index 4a218c613f..06ab47117a 100644
--- a/tests-cuda/test_3136_cuda_reducers.py
+++ b/tests-cuda/test_3136_cuda_reducers.py
@@ -430,8 +430,7 @@ def test_2020_reduce_axis_none_std_no_mask_axis_none():
     out3 = ak.std(array[2], axis=None, mask_identity=True)
     assert out3 is None
     del array
-    del out1
-    del out2
+    del out1, out2, out3
 
 
 def test_2020_reduce_axis_none_std():

From 15068b66fb199f293882ba9a85f7db63610b0c64 Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Tue, 18 Jun 2024 15:32:12 +0200
Subject: [PATCH 27/33] fix: check for the backend of head

---
 src/awkward/contents/regulararray.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/awkward/contents/regulararray.py b/src/awkward/contents/regulararray.py
index 3c79050740..a5a16fcdff 100644
--- a/src/awkward/contents/regulararray.py
+++ b/src/awkward/contents/regulararray.py
@@ -471,7 +471,7 @@ def _getitem_next(
             nexthead, nexttail = ak._slicing.head_tail(tail)
             nextcarry = ak.index.Index64.empty(self._length, index_nplike)
             assert nextcarry.nplike is index_nplike
-            if ak.backend(nextcarry.data) == "cuda":
+            if ak.backend(head) == "cuda":
                 head = int(ak.to_backend(head, backend=self._backend)[0])
             self._maybe_index_error(
                 self._backend[

From 6cf0919be5e02e5e2d1a52af5725654b11713c05 Mon Sep 17 00:00:00 2001
From: Manasvi Goyal <mg.manasvi@gmail.com>
Date: Fri, 21 Jun 2024 14:04:07 +0200
Subject: [PATCH 28/33] test: reducer CUDA kernel tests

---
 ...est_3162_cuda_generic_reducer_operation.py | 865 ++++++++++++++++++
 1 file changed, 865 insertions(+)
 create mode 100644 tests-cuda/test_3162_cuda_generic_reducer_operation.py

diff --git a/tests-cuda/test_3162_cuda_generic_reducer_operation.py b/tests-cuda/test_3162_cuda_generic_reducer_operation.py
new file mode 100644
index 0000000000..bcfc5488c9
--- /dev/null
+++ b/tests-cuda/test_3162_cuda_generic_reducer_operation.py
@@ -0,0 +1,865 @@
+from __future__ import annotations
+
+import cupy as cp
+import numpy as np
+import pytest
+
+import awkward as ak
+
+to_list = ak.operations.to_list
+
+
+@pytest.fixture(scope="function", autouse=True)
+def cleanup_cuda():  # autouse: torn down after every test in this module
+    yield  # the test body runs here; the lines below are teardown
+    cp.get_default_memory_pool().free_all_blocks()  # public pool accessor
+    cp.cuda.Device().synchronize()
+
+
+to_list = ak.operations.to_list
+
+primes = [x for x in range(2, 1000) if all(x % n != 0 for n in range(2, x))]
+
+
+def test_0115_generic_reducer_operation_ListOffsetArray_to_RegularArray():
+    content = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5], dtype=np.int64))
+    offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 30], dtype=np.int64))
+    listoffsetarray = ak.contents.ListOffsetArray(offsets1, content)
+    regulararray = listoffsetarray.to_RegularArray()
+    cuda_listoffsetarray = ak.to_backend(listoffsetarray, "cuda")
+    cuda_regulararray = ak.to_backend(regulararray, "cuda")
+
+    assert to_list(cuda_listoffsetarray) == to_list(cuda_regulararray)
+    del cuda_listoffsetarray, cuda_regulararray
+
+
+def test_0115_generic_reducer_operation_dimension_optiontype_1():
+    content = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5], dtype=np.int64))
+    offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 30], dtype=np.int64))
+    listoffsetarray = ak.contents.ListOffsetArray(offsets1, content)
+    index = ak.index.Index64(np.array([5, -1, 3, 2, -1, 0], dtype=np.int64))
+    indexedarray = ak.contents.IndexedOptionArray(index, listoffsetarray)
+    depth2 = ak.contents.RegularArray(indexedarray, 3)
+    depth2 = ak.to_backend(depth2, "cuda")
+
+    assert to_list(depth2) == [
+        [[101, 103, 107, 109, 113], None, [53, 59, 61, 67, 71]],
+        [[31, 37, 41, 43, 47], None, [2, 3, 5, 7, 11]],
+    ]
+    assert to_list(ak.prod(depth2, axis=-1, keepdims=False, highlevel=False)) == [
+        [101 * 103 * 107 * 109 * 113, None, 53 * 59 * 61 * 67 * 71],
+        [31 * 37 * 41 * 43 * 47, None, 2 * 3 * 5 * 7 * 11],
+    ]
+    assert to_list(ak.prod(depth2, axis=-1, keepdims=True, highlevel=False)) == [
+        [[101 * 103 * 107 * 109 * 113], None, [53 * 59 * 61 * 67 * 71]],
+        [[31 * 37 * 41 * 43 * 47], None, [2 * 3 * 5 * 7 * 11]],
+    ]
+    del depth2
+
+
+def test_0115_generic_reducer_operation_dimension_optiontype_2():
+    content = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5], dtype=np.int64))
+    offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 30], dtype=np.int64))
+    listoffsetarray = ak.contents.ListOffsetArray(offsets1, content)
+    index = ak.index.Index64(np.array([5, 4, 3, 2, 1, 0], dtype=np.int64))
+    indexedarray = ak.contents.IndexedArray(index, listoffsetarray)
+    depth2 = ak.contents.RegularArray(indexedarray, 3)
+    depth2 = ak.to_backend(depth2, "cuda")
+
+    assert to_list(depth2) == [
+        [[101, 103, 107, 109, 113], [73, 79, 83, 89, 97], [53, 59, 61, 67, 71]],
+        [[31, 37, 41, 43, 47], [13, 17, 19, 23, 29], [2, 3, 5, 7, 11]],
+    ]
+    assert to_list(ak.prod(depth2, axis=-1, highlevel=False)) == [
+        [101 * 103 * 107 * 109 * 113, 73 * 79 * 83 * 89 * 97, 53 * 59 * 61 * 67 * 71],
+        [31 * 37 * 41 * 43 * 47, 13 * 17 * 19 * 23 * 29, 2 * 3 * 5 * 7 * 11],
+    ]
+    assert to_list(ak.prod(depth2, axis=-1, keepdims=True, highlevel=False)) == [
+        [
+            [101 * 103 * 107 * 109 * 113],
+            [73 * 79 * 83 * 89 * 97],
+            [53 * 59 * 61 * 67 * 71],
+        ],
+        [[31 * 37 * 41 * 43 * 47], [13 * 17 * 19 * 23 * 29], [2 * 3 * 5 * 7 * 11]],
+    ]
+    del depth2
+
+
+def test_0115_generic_reducer_operation_reproduce_numpy_1():
+    content1 = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5], dtype=np.int64))
+    offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 30], dtype=np.int64))
+    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
+    depth2 = ak.contents.ListOffsetArray(
+        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
+    )
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
+
+    assert to_list(depth2) == [
+        [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]],
+        [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]],
+    ]
+
+    assert to_list(ak.prod(depth2, axis=-1, highlevel=False)) == [
+        [2 * 3 * 5 * 7 * 11, 13 * 17 * 19 * 23 * 29, 31 * 37 * 41 * 43 * 47],
+        [53 * 59 * 61 * 67 * 71, 73 * 79 * 83 * 89 * 97, 101 * 103 * 107 * 109 * 113],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), axis=-1, highlevel=False).form
+        == ak.prod(depth2, axis=-1, highlevel=False).form
+    )
+    assert to_list(ak.prod(depth2, axis=2, highlevel=False)) == [
+        [2 * 3 * 5 * 7 * 11, 13 * 17 * 19 * 23 * 29, 31 * 37 * 41 * 43 * 47],
+        [53 * 59 * 61 * 67 * 71, 73 * 79 * 83 * 89 * 97, 101 * 103 * 107 * 109 * 113],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), axis=2, highlevel=False).form
+        == ak.prod(depth2, axis=2, highlevel=False).form
+    )
+
+    assert to_list(ak.prod(depth2, axis=-2, highlevel=False)) == [
+        [2 * 13 * 31, 3 * 17 * 37, 5 * 19 * 41, 7 * 23 * 43, 11 * 29 * 47],
+        [53 * 73 * 101, 59 * 79 * 103, 61 * 83 * 107, 67 * 89 * 109, 71 * 97 * 113],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), axis=-2, highlevel=False).form
+        == ak.prod(depth2, axis=-2, highlevel=False).form
+    )
+    assert to_list(ak.prod(depth2, axis=1, highlevel=False)) == [
+        [2 * 13 * 31, 3 * 17 * 37, 5 * 19 * 41, 7 * 23 * 43, 11 * 29 * 47],
+        [53 * 73 * 101, 59 * 79 * 103, 61 * 83 * 107, 67 * 89 * 109, 71 * 97 * 113],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), axis=1, highlevel=False).form
+        == ak.prod(depth2, axis=1, highlevel=False).form
+    )
+
+    assert to_list(ak.prod(depth2, axis=-3, highlevel=False)) == [
+        [2 * 53, 3 * 59, 5 * 61, 7 * 67, 11 * 71],
+        [13 * 73, 17 * 79, 19 * 83, 23 * 89, 29 * 97],
+        [31 * 101, 37 * 103, 41 * 107, 43 * 109, 47 * 113],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), axis=-3, highlevel=False).form
+        == ak.prod(depth2, axis=-3, highlevel=False).form
+    )
+    assert to_list(ak.prod(depth2, axis=0, highlevel=False)) == [
+        [2 * 53, 3 * 59, 5 * 61, 7 * 67, 11 * 71],
+        [13 * 73, 17 * 79, 19 * 83, 23 * 89, 29 * 97],
+        [31 * 101, 37 * 103, 41 * 107, 43 * 109, 47 * 113],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), axis=0, highlevel=False).form
+        == ak.prod(depth2, axis=0, highlevel=False).form
+    )
+    del depth2
+
+
+def test_0115_generic_reducer_operation_reproduce_numpy_2():
+    content2 = ak.contents.NumpyArray(np.array(primes[:12], dtype=np.int64))
+    offsets3 = ak.index.Index64(np.array([0, 4, 8, 12], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(offsets3, content2)
+    depth1 = ak.to_backend(depth1, "cuda", highlevel=False)
+
+    assert to_list(ak.prod(depth1, -1, highlevel=False)) == [
+        2 * 3 * 5 * 7,
+        11 * 13 * 17 * 19,
+        23 * 29 * 31 * 37,
+    ]
+    assert (
+        ak.prod(depth1.to_typetracer(), -1, highlevel=False).form
+        == ak.prod(depth1, -1, highlevel=False).form
+    )
+    assert to_list(ak.prod(depth1, 1, highlevel=False)) == [
+        2 * 3 * 5 * 7,
+        11 * 13 * 17 * 19,
+        23 * 29 * 31 * 37,
+    ]
+    assert (
+        ak.prod(depth1.to_typetracer(), 1, highlevel=False).form
+        == ak.prod(depth1, 1, highlevel=False).form
+    )
+
+    assert to_list(ak.prod(depth1, -2, highlevel=False)) == [
+        2 * 11 * 23,
+        3 * 13 * 29,
+        5 * 17 * 31,
+        7 * 19 * 37,
+    ]
+    assert (
+        ak.prod(depth1.to_typetracer(), -2, highlevel=False).form
+        == ak.prod(depth1, -2, highlevel=False).form
+    )
+    assert to_list(ak.prod(depth1, 0, highlevel=False)) == [
+        2 * 11 * 23,
+        3 * 13 * 29,
+        5 * 17 * 31,
+        7 * 19 * 37,
+    ]
+    assert (
+        ak.prod(depth1.to_typetracer(), 0, highlevel=False).form
+        == ak.prod(depth1, 0, highlevel=False).form
+    )
+    del depth1
+
+
+def test_0115_generic_reducer_operation_gaps_1():
+    content1 = ak.contents.NumpyArray(
+        np.array([123] + primes[: 2 * 3 * 5], dtype=np.int64)
+    )
+    offsets1 = ak.index.Index64(np.array([0, 1, 6, 11, 16, 21, 26, 31], dtype=np.int64))
+    offsets2 = ak.index.Index64(np.array([1, 4, 7], dtype=np.int64))
+    depth2 = ak.contents.ListOffsetArray(
+        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
+    )
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
+
+    assert to_list(depth2) == [
+        [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]],
+        [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]],
+    ]
+
+    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
+        [106, 177, 305, 469, 781],
+        [949, 1343, 1577, 2047, 2813],
+        [3131, 3811, 4387, 4687, 5311],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
+        == ak.prod(depth2, -3, highlevel=False).form
+    )
+    del depth2
+
+
+def test_0115_generic_reducer_operation_gaps_2():
+    content1 = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5 - 1], dtype=np.int64))
+    offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 29], dtype=np.int64))
+    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
+    depth2 = ak.contents.ListOffsetArray(
+        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
+    )
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
+
+    assert to_list(depth2) == [
+        [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]],
+        [
+            [53, 59, 61, 67, 71],
+            [73, 79, 83, 89, 97],
+            [
+                101,
+                103,
+                107,
+                109,
+            ],
+        ],
+    ]
+
+    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
+        [106, 177, 305, 469, 781],
+        [949, 1343, 1577, 2047, 2813],
+        [3131, 3811, 4387, 4687, 47],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
+        == ak.prod(depth2, -3, highlevel=False).form
+    )
+    del depth2
+
+
+def test_0115_generic_reducer_operation_gaps_3():
+    content1 = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5 - 2], dtype=np.int64))
+    offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 28], dtype=np.int64))
+    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
+    depth2 = ak.contents.ListOffsetArray(
+        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
+    )
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
+
+    assert to_list(depth2) == [
+        [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]],
+        [
+            [53, 59, 61, 67, 71],
+            [73, 79, 83, 89, 97],
+            [
+                101,
+                103,
+                107,
+            ],
+        ],
+    ]
+
+    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
+        [106, 177, 305, 469, 781],
+        [949, 1343, 1577, 2047, 2813],
+        [3131, 3811, 4387, 43, 47],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
+        == ak.prod(depth2, -3, highlevel=False).form
+    )
+    del depth2
+
+
+def test_0115_generic_reducer_operation_gaps_4():
+    content1 = ak.contents.NumpyArray(
+        np.array(
+            [
+                2,
+                3,
+                5,
+                7,
+                11,
+                13,
+                17,
+                19,
+                23,
+                29,
+                31,
+                37,
+                41,
+                43,
+                47,
+                53,
+                59,
+                61,
+                67,
+                71,
+                73,
+                79,
+                83,
+                89,
+                101,
+                103,
+                107,
+                109,
+            ],
+            dtype=np.int64,
+        )
+    )
+    offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 24, 28], dtype=np.int64))
+    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
+    depth2 = ak.contents.ListOffsetArray(
+        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
+    )
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
+
+    assert to_list(depth2) == [
+        [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]],
+        [
+            [53, 59, 61, 67, 71],
+            [
+                73,
+                79,
+                83,
+                89,
+            ],
+            [101, 103, 107, 109],
+        ],
+    ]
+
+    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
+        [106, 177, 305, 469, 781],
+        [949, 1343, 1577, 2047, 29],
+        [3131, 3811, 4387, 4687, 47],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
+        == ak.prod(depth2, -3, highlevel=False).form
+    )
+    del depth2
+
+
+def test_0115_generic_reducer_operation_gaps_5():
+    content1 = ak.contents.NumpyArray(np.array(primes[1 : 2 * 3 * 5], dtype=np.int64))
+    offsets1 = ak.index.Index64(np.array([0, 4, 9, 14, 19, 24, 29], dtype=np.int64))
+    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
+    depth2 = ak.contents.ListOffsetArray(
+        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
+    )
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
+
+    assert to_list(depth2) == [
+        [[3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]],
+        [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]],
+    ]
+
+    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
+        [159, 295, 427, 737, 71],
+        [949, 1343, 1577, 2047, 2813],
+        [3131, 3811, 4387, 4687, 5311],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
+        == ak.prod(depth2, -3, highlevel=False).form
+    )
+    del depth2
+
+
+def test_0115_generic_reducer_operation_gaps_6():
+    content1 = ak.contents.NumpyArray(np.array(primes[2 : 2 * 3 * 5], dtype=np.int64))
+    offsets1 = ak.index.Index64(np.array([0, 3, 8, 13, 18, 23, 28], dtype=np.int64))
+    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
+    depth2 = ak.contents.ListOffsetArray(
+        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
+    )
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
+
+    assert to_list(depth2) == [
+        [[5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]],
+        [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]],
+    ]
+
+    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
+        [265, 413, 671, 67, 71],
+        [949, 1343, 1577, 2047, 2813],
+        [3131, 3811, 4387, 4687, 5311],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
+        == ak.prod(depth2, -3, highlevel=False).form
+    )
+    del depth2
+
+
+def test_0115_generic_reducer_operation_gaps_7():
+    content1 = ak.contents.NumpyArray(
+        np.array(
+            [
+                3,
+                5,
+                7,
+                13,
+                17,
+                19,
+                23,
+                29,
+                31,
+                37,
+                41,
+                43,
+                47,
+                53,
+                59,
+                61,
+                67,
+                71,
+                73,
+                79,
+                83,
+                89,
+                97,
+                101,
+                103,
+                107,
+                109,
+                113,
+            ],
+            dtype=np.int64,
+        )
+    )
+    offsets1 = ak.index.Index64(np.array([0, 3, 8, 13, 18, 23, 28], dtype=np.int64))
+    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
+    depth2 = ak.contents.ListOffsetArray(
+        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
+    )
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
+
+    assert to_list(depth2) == [
+        [
+            [
+                3,
+                5,
+                7,
+            ],
+            [13, 17, 19, 23, 29],
+            [31, 37, 41, 43, 47],
+        ],
+        [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]],
+    ]
+
+    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
+        [159, 295, 427, 67, 71],
+        [949, 1343, 1577, 2047, 2813],
+        [3131, 3811, 4387, 4687, 5311],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
+        == ak.prod(depth2, -3, highlevel=False).form
+    )
+    del depth2
+
+
+def test_0115_generic_reducer_operation_gaps_8():
+    content1 = ak.contents.NumpyArray(
+        np.array(
+            [
+                3,
+                5,
+                7,
+                11,
+                13,
+                17,
+                19,
+                23,
+                31,
+                37,
+                41,
+                43,
+                47,
+                53,
+                59,
+                61,
+                67,
+                71,
+                73,
+                79,
+                83,
+                89,
+                97,
+                101,
+                103,
+                107,
+                109,
+                113,
+            ],
+            dtype=np.int64,
+        )
+    )
+    offsets1 = ak.index.Index64(np.array([0, 4, 8, 13, 18, 23, 28], dtype=np.int64))
+    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
+    depth2 = ak.contents.ListOffsetArray(
+        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
+    )
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
+
+    assert to_list(depth2) == [
+        [[3, 5, 7, 11], [13, 17, 19, 23], [31, 37, 41, 43, 47]],
+        [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]],
+    ]
+
+    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
+        [159, 295, 427, 737, 71],
+        [949, 1343, 1577, 2047, 97],
+        [3131, 3811, 4387, 4687, 5311],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
+        == ak.prod(depth2, -3, highlevel=False).form
+    )
+    del depth2
+
+
+def test_0115_generic_reducer_operation_gaps_9():
+    content1 = ak.contents.NumpyArray(
+        np.array(
+            [
+                2,
+                3,
+                5,
+                7,
+                11,
+                13,
+                17,
+                19,
+                23,
+                29,
+                31,
+                37,
+                41,
+                43,
+                53,
+                59,
+                61,
+                67,
+                71,
+                73,
+                79,
+                83,
+                89,
+                97,
+                101,
+                103,
+                107,
+                109,
+            ],
+            dtype=np.int64,
+        )
+    )
+    offsets1 = ak.index.Index64(np.array([0, 5, 10, 14, 19, 24, 28], dtype=np.int64))
+    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
+    depth2 = ak.contents.ListOffsetArray(
+        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
+    )
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
+
+    assert to_list(depth2) == [
+        [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43]],
+        [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109]],
+    ]
+
+    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
+        [106, 177, 305, 469, 781],
+        [949, 1343, 1577, 2047, 2813],
+        [3131, 3811, 4387, 4687],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
+        == ak.prod(depth2, -3, highlevel=False).form
+    )
+    del depth2
+
+
+def test_0115_generic_reducer_operation_gaps_10():
+    """prod over axis=-3 when the middle inner list of each outer list is one short."""
+    leaves = ak.contents.NumpyArray(
+        np.array(
+            [
+                2,
+                3,
+                5,
+                7,
+                11,
+                13,
+                17,
+                19,
+                23,
+                31,
+                37,
+                41,
+                43,
+                47,
+                53,
+                59,
+                61,
+                67,
+                71,
+                73,
+                79,
+                83,
+                89,
+                101,
+                103,
+                107,
+                109,
+                113,
+            ],
+            dtype=np.int64,
+        )
+    )
+    inner_ix = ak.index.Index64(np.array([0, 5, 9, 14, 19, 23, 28], dtype=np.int64))
+    outer_ix = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(inner_ix, leaves)
+    depth2 = ak.contents.ListOffsetArray(outer_ix, depth1)
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
+
+    assert to_list(depth2) == [
+        [[2, 3, 5, 7, 11], [13, 17, 19, 23], [31, 37, 41, 43, 47]],
+        [[53, 59, 61, 67, 71], [73, 79, 83, 89], [101, 103, 107, 109, 113]],
+    ]
+
+    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
+        [106, 177, 305, 469, 781],
+        [949, 1343, 1577, 2047],
+        [3131, 3811, 4387, 4687, 5311],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
+        == ak.prod(depth2, -3, highlevel=False).form
+    )
+    del depth2
+
+
+def test_0115_generic_reducer_operation_gaps_11():
+    """prod over axis=-3 with an empty inner list in the middle outer list."""
+    leaves = ak.contents.NumpyArray(np.array(primes[:9], dtype=np.int64))
+    inner_ix = ak.index.Index64(np.array([0, 3, 4, 6, 6, 7, 9], dtype=np.int64))
+    outer_ix = ak.index.Index64(np.array([0, 2, 4, 6], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(inner_ix, leaves)
+    depth2 = ak.contents.ListOffsetArray(outer_ix, depth1)
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
+
+    assert to_list(depth2) == [[[2, 3, 5], [7]], [[11, 13], []], [[17], [19, 23]]]
+
+    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
+        [2 * 11 * 17, 3 * 13, 5],
+        [7 * 19, 23],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
+        == ak.prod(depth2, -3, highlevel=False).form
+    )
+    del depth2
+
+
+def test_0115_generic_reducer_operation_gaps_12():
+    """prod over axis=-3 when the middle outer list has a single inner list."""
+    leaves = ak.contents.NumpyArray(np.array(primes[:9], dtype=np.int64))
+    inner_ix = ak.index.Index64(np.array([0, 3, 4, 6, 7, 9], dtype=np.int64))
+    outer_ix = ak.index.Index64(np.array([0, 2, 3, 5], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(inner_ix, leaves)
+    depth2 = ak.contents.ListOffsetArray(outer_ix, depth1)
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
+
+    assert to_list(depth2) == [[[2, 3, 5], [7]], [[11, 13]], [[17], [19, 23]]]
+
+    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
+        [2 * 11 * 17, 3 * 13, 5],
+        [7 * 19, 23],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
+        == ak.prod(depth2, -3, highlevel=False).form
+    )
+    del depth2
+
+
+def test_0115_generic_reducer_operation_gaps_13():
+    """prod over axis=-3 when the two outer lists have different inner lengths."""
+    leaves = ak.contents.NumpyArray(np.array(primes[:10], dtype=np.int64))
+    inner_ix = ak.index.Index64(np.array([0, 3, 5, 6, 8, 9, 10], dtype=np.int64))
+    outer_ix = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(inner_ix, leaves)
+    depth2 = ak.contents.ListOffsetArray(outer_ix, depth1)
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
+
+    assert to_list(depth2) == [[[2, 3, 5], [7, 11], [13]], [[17, 19], [23], [29]]]
+
+    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
+        [34, 57, 5],
+        [161, 11],
+        [377],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
+        == ak.prod(depth2, -3, highlevel=False).form
+    )
+    del depth2
+
+
+def test_0115_generic_reducer_operation_gaps_14():
+    """prod over axis=-3 with outer lists of 4 and 2 items and one empty inner list."""
+    leaves = ak.contents.NumpyArray(np.array(primes[:9], dtype=np.int64))
+    inner_ix = ak.index.Index64(np.array([0, 3, 3, 5, 6, 8, 9], dtype=np.int64))
+    outer_ix = ak.index.Index64(np.array([0, 4, 6], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(inner_ix, leaves)
+    depth2 = ak.contents.ListOffsetArray(outer_ix, depth1)
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
+
+    assert to_list(depth2) == [[[2, 3, 5], [], [7, 11], [13]], [[17, 19], [23]]]
+
+    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
+        [34, 57, 5],
+        [23],
+        [7, 11],
+        [13],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
+        == ak.prod(depth2, -3, highlevel=False).form
+    )
+    del depth2
+
+
+def test_0115_generic_reducer_operation_gaps_15():
+    """prod over axis=-3 with an empty inner list and an empty outer list."""
+    leaves = ak.contents.NumpyArray(np.array(primes[:9], dtype=np.int64))
+    inner_ix = ak.index.Index64(np.array([0, 3, 3, 5, 6, 8, 9], dtype=np.int64))
+    outer_ix = ak.index.Index64(np.array([0, 4, 4, 6], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(inner_ix, leaves)
+    depth2 = ak.contents.ListOffsetArray(outer_ix, depth1)
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
+
+    assert to_list(depth2) == [[[2, 3, 5], [], [7, 11], [13]], [], [[17, 19], [23]]]
+
+    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
+        [34, 57, 5],
+        [23],
+        [7, 11],
+        [13],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
+        == ak.prod(depth2, -3, highlevel=False).form
+    )
+    del depth2
+
+
+def test_0115_generic_reducer_operation_gaps_16():
+    """prod over axis=-1 and axis=-2 of a regular 2 x 3 x 5 nested list."""
+    leaves = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5], dtype=np.int64))
+    inner_ix = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 30], dtype=np.int64))
+    outer_ix = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(inner_ix, leaves)
+    depth2 = ak.contents.ListOffsetArray(outer_ix, depth1)
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
+
+    assert to_list(depth2) == [
+        [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]],
+        [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]],
+    ]
+
+    assert to_list(ak.prod(depth2, -1, highlevel=False)) == [
+        [2 * 3 * 5 * 7 * 11, 13 * 17 * 19 * 23 * 29, 31 * 37 * 41 * 43 * 47],
+        [53 * 59 * 61 * 67 * 71, 73 * 79 * 83 * 89 * 97, 101 * 103 * 107 * 109 * 113],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -1, highlevel=False).form
+        == ak.prod(depth2, -1, highlevel=False).form
+    )
+
+    assert to_list(ak.prod(depth2, -2, highlevel=False)) == [
+        [2 * 13 * 31, 3 * 17 * 37, 5 * 19 * 41, 7 * 23 * 43, 11 * 29 * 47],
+        [53 * 73 * 101, 59 * 79 * 103, 61 * 83 * 107, 67 * 89 * 109, 71 * 97 * 113],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -2, highlevel=False).form
+        == ak.prod(depth2, -2, highlevel=False).form
+    )
+    del depth2
+
+
+def test_0115_generic_reducer_operation_gaps_17():
+    """prod over axes -1, -2 and -3 with an empty inner and an empty outer list."""
+    leaves = ak.contents.NumpyArray(np.array(primes[:9], dtype=np.int64))
+    inner_ix = ak.index.Index64(np.array([0, 3, 3, 5, 6, 8, 9], dtype=np.int64))
+    outer_ix = ak.index.Index64(np.array([0, 4, 4, 6], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(inner_ix, leaves)
+    depth2 = ak.contents.ListOffsetArray(outer_ix, depth1)
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
+
+    assert to_list(depth2) == [
+        [[2, 3, 5], [], [7, 11], [13]],
+        [],
+        [[17, 19], [23]],
+    ]
+
+    assert to_list(ak.prod(depth2, -1, highlevel=False)) == [
+        [2 * 3 * 5, 1, 7 * 11, 13],
+        [],
+        [17 * 19, 23],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -1, highlevel=False).form
+        == ak.prod(depth2, -1, highlevel=False).form
+    )
+
+    assert to_list(ak.prod(depth2, -2, highlevel=False)) == [
+        [2 * 7 * 13, 3 * 11, 5],
+        [],
+        [17 * 23, 19],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -2, highlevel=False).form
+        == ak.prod(depth2, -2, highlevel=False).form
+    )
+
+    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
+        [2 * 17, 3 * 19, 5],
+        [23],
+        [7, 11],
+        [13],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
+        == ak.prod(depth2, -3, highlevel=False).form
+    )
+    del depth2

From 347c4c902c629f409664f2b4c97a25965e4f0c4d Mon Sep 17 00:00:00 2001
From: Manasvi Goyal <mg.manasvi@gmail.com>
Date: Fri, 21 Jun 2024 15:30:47 +0200
Subject: [PATCH 29/33] test: add more reducer tests

---
 ...est_3162_cuda_generic_reducer_operation.py | 457 ++++++++++++++++++
 1 file changed, 457 insertions(+)

diff --git a/tests-cuda/test_3162_cuda_generic_reducer_operation.py b/tests-cuda/test_3162_cuda_generic_reducer_operation.py
index bcfc5488c9..94d1bb3570 100644
--- a/tests-cuda/test_3162_cuda_generic_reducer_operation.py
+++ b/tests-cuda/test_3162_cuda_generic_reducer_operation.py
@@ -863,3 +863,460 @@ def test_0115_generic_reducer_operation_gaps_17():
         == ak.prod(depth2, -3, highlevel=False).form
     )
     del depth2
+
+
+def test_0115_generic_reducer_operation_complicated():
+    """prod on a list of records: reducing the records raises, their fields reduce."""
+    # Field "x": one level of lists over the first five primes.
+    x_ix = ak.index.Index64(np.array([0, 3, 3, 5], dtype=np.int64))
+    x_leaves = ak.contents.NumpyArray(np.array(primes[:5], dtype=np.int64))
+    x_field = ak.contents.ListOffsetArray(x_ix, x_leaves)
+    # Field "y": two levels of lists over the first nine primes.
+    y_inner_ix = ak.index.Index64(np.array([0, 3, 3, 5, 6, 8, 9], dtype=np.int64))
+    y_outer_ix = ak.index.Index64(np.array([0, 4, 4, 6], dtype=np.int64))
+    y_leaves = ak.contents.NumpyArray(np.array(primes[:9], dtype=np.int64))
+    y_field = ak.contents.ListOffsetArray(
+        y_outer_ix, ak.contents.ListOffsetArray(y_inner_ix, y_leaves)
+    )
+    # Zip both fields into records and wrap the records in an outer list.
+    top_ix = ak.index.Index64(np.array([0, 1, 1, 3], dtype=np.int64))
+    records = ak.contents.RecordArray([x_field, y_field], ["x", "y"])
+    complicated = ak.contents.ListOffsetArray(top_ix, records)
+    complicated = ak.to_backend(complicated, "cuda", highlevel=False)
+
+    assert to_list(complicated) == [
+        [{"x": [2, 3, 5], "y": [[2, 3, 5], [], [7, 11], [13]]}],
+        [],
+        [{"x": [], "y": []}, {"x": [7, 11], "y": [[17, 19], [23]]}],
+    ]
+
+    assert to_list(complicated["x"]) == [[[2, 3, 5]], [], [[], [7, 11]]]
+    assert complicated.to_typetracer()["x"].form == complicated["x"].form
+    assert to_list(complicated["y"]) == [
+        [[[2, 3, 5], [], [7, 11], [13]]],
+        [],
+        [[], [[17, 19], [23]]],
+    ]
+    assert complicated.to_typetracer()["y"].form == complicated["y"].form
+
+    with pytest.raises(TypeError):
+        to_list(ak.prod(complicated, -1, highlevel=False))
+
+    with pytest.raises(TypeError):
+        assert (
+            ak.prod(complicated.to_typetracer(), -1, highlevel=False).form
+            == ak.prod(complicated, -1, highlevel=False).form
+        )
+
+    assert to_list(ak.prod(complicated["x"], -1, highlevel=False)) == [
+        [30],
+        [],
+        [1, 77],
+    ]
+    assert (
+        ak.prod(complicated.to_typetracer()["x"], -1, highlevel=False).form
+        == ak.prod(complicated["x"], -1, highlevel=False).form
+    )
+    assert to_list(ak.prod(complicated["y"], -1, highlevel=False)) == [
+        [[30, 1, 77, 13]],
+        [],
+        [[], [323, 23]],
+    ]
+    assert (
+        ak.prod(complicated.to_typetracer()["y"], -1, highlevel=False).form
+        == ak.prod(complicated["y"], -1, highlevel=False).form
+    )
+
+    with pytest.raises(TypeError):
+        to_list(ak.prod(complicated, -2, highlevel=False))
+
+    with pytest.raises(TypeError):
+        assert (
+            ak.prod(complicated.to_typetracer(), -2, highlevel=False).form
+            == ak.prod(complicated, -2, highlevel=False).form
+        )
+    assert to_list(ak.prod(complicated["x"], -2, highlevel=False)) == [
+        [2, 3, 5],
+        [],
+        [7, 11],
+    ]
+    assert (
+        ak.prod(complicated.to_typetracer()["x"], -2, highlevel=False).form
+        == ak.prod(complicated["x"], -2, highlevel=False).form
+    )
+    assert to_list(ak.prod(complicated["y"], -2, highlevel=False)) == [
+        [[182, 33, 5]],
+        [],
+        [[], [391, 19]],
+    ]
+    assert (
+        ak.prod(complicated.to_typetracer()["y"], -2, highlevel=False).form
+        == ak.prod(complicated["y"], -2, highlevel=False).form
+    )
+
+    assert to_list(complicated[0]) == [
+        {"x": [2, 3, 5], "y": [[2, 3, 5], [], [7, 11], [13]]}
+    ]
+    assert complicated.to_typetracer()[0].form == complicated[0].form
+
+    with pytest.raises(TypeError):
+        to_list(ak.prod(complicated[0], -1, highlevel=False))
+
+    with pytest.raises(TypeError):
+        to_list(ak.prod(complicated.to_typetracer()[0], -1, highlevel=False))
+    del complicated
+
+
+def test_0115_generic_reducer_operation_EmptyArray():
+    """prod of empty lists is the identity 1, for EmptyArray and empty int64 content."""
+    offsets = ak.index.Index64(np.array([0, 0, 0, 0], dtype=np.int64))
+    array = ak.contents.ListOffsetArray(offsets, ak.contents.EmptyArray())
+    # highlevel=False, as in the other tests here, so .to_typetracer() sees a layout.
+    array = ak.to_backend(array, "cuda", highlevel=False)
+
+    assert to_list(array) == [[], [], []]
+
+    assert to_list(ak.prod(array, -1, highlevel=False)) == [1, 1, 1]
+    traced = ak.prod(array.to_typetracer(), -1, highlevel=False)
+    assert traced.form == ak.prod(array, -1, highlevel=False).form
+
+    offsets = ak.index.Index64(np.array([0, 0, 0, 0], dtype=np.int64))
+    array = ak.contents.ListOffsetArray(
+        offsets, ak.contents.NumpyArray(np.array([], dtype=np.int64))
+    )
+    array = ak.to_backend(array, "cuda", highlevel=False)
+
+    assert to_list(array) == [[], [], []]
+
+    assert to_list(ak.prod(array, -1, highlevel=False)) == [1, 1, 1]
+    assert (
+        ak.prod(array.to_typetracer(), -1, highlevel=False).form
+        == ak.prod(array, -1, highlevel=False).form
+    )
+    del array
+
+
+def test_0115_generic_reducer_operation_IndexedOptionArray_1():
+    """prod on every axis of lists reordered through a (non-option) IndexedArray."""
+    content = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5], dtype=np.int64))
+    offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 30], dtype=np.int64))
+    listoffsetarray = ak.contents.ListOffsetArray(offsets1, content)
+    index = ak.index.Index64(np.array([5, 4, 3, 2, 1, 0], dtype=np.int64))
+    indexedarray = ak.contents.IndexedArray(index, listoffsetarray)
+    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
+    depth2 = ak.contents.ListOffsetArray(offsets2, indexedarray)
+    # highlevel=False, as in the other tests here, so .to_typetracer() sees a layout.
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
+
+    assert to_list(depth2) == [
+        [[101, 103, 107, 109, 113], [73, 79, 83, 89, 97], [53, 59, 61, 67, 71]],
+        [[31, 37, 41, 43, 47], [13, 17, 19, 23, 29], [2, 3, 5, 7, 11]],
+    ]
+
+    assert to_list(ak.prod(depth2, -1, highlevel=False)) == [
+        [101 * 103 * 107 * 109 * 113, 73 * 79 * 83 * 89 * 97, 53 * 59 * 61 * 67 * 71],
+        [31 * 37 * 41 * 43 * 47, 13 * 17 * 19 * 23 * 29, 2 * 3 * 5 * 7 * 11],
+    ]
+    traced = ak.prod(depth2.to_typetracer(), -1, highlevel=False)
+    assert traced.form == ak.prod(depth2, -1, highlevel=False).form
+
+    assert to_list(ak.prod(depth2, -2, highlevel=False)) == [
+        [101 * 73 * 53, 103 * 79 * 59, 107 * 83 * 61, 109 * 89 * 67, 113 * 97 * 71],
+        [31 * 13 * 2, 37 * 17 * 3, 41 * 19 * 5, 43 * 23 * 7, 47 * 29 * 11],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -2, highlevel=False).form
+        == ak.prod(depth2, -2, highlevel=False).form
+    )
+
+    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
+        [101 * 31, 103 * 37, 107 * 41, 109 * 43, 113 * 47],
+        [73 * 13, 79 * 17, 83 * 19, 89 * 23, 97 * 29],
+        [53 * 2, 59 * 3, 61 * 5, 67 * 7, 71 * 11],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
+        == ak.prod(depth2, -3, highlevel=False).form
+    )
+    del depth2
+
+
+def test_0115_generic_reducer_operation_IndexedOptionArray_2():
+    """prod on every axis when whole inner lists are missing (None)."""
+    content = ak.contents.NumpyArray(
+        np.array(
+            [
+                2,
+                3,
+                5,
+                7,
+                11,
+                31,
+                37,
+                41,
+                43,
+                47,
+                53,
+                59,
+                61,
+                67,
+                71,
+                101,
+                103,
+                107,
+                109,
+                113,
+            ],
+            dtype=np.int64,
+        )
+    )
+    offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20], dtype=np.int64))
+    listoffsetarray = ak.contents.ListOffsetArray(offsets1, content)
+    index = ak.index.Index64(np.array([3, -1, 2, 1, -1, 0], dtype=np.int64))
+    indexedoptionarray = ak.contents.IndexedOptionArray(index, listoffsetarray)
+    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
+    depth2 = ak.contents.ListOffsetArray(offsets2, indexedoptionarray)
+    # highlevel=False, as in the other tests here, so .to_typetracer() sees a layout.
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
+
+    assert to_list(depth2) == [
+        [[101, 103, 107, 109, 113], None, [53, 59, 61, 67, 71]],
+        [[31, 37, 41, 43, 47], None, [2, 3, 5, 7, 11]],
+    ]
+
+    assert to_list(ak.prod(depth2, -1, highlevel=False)) == [
+        [101 * 103 * 107 * 109 * 113, None, 53 * 59 * 61 * 67 * 71],
+        [31 * 37 * 41 * 43 * 47, None, 2 * 3 * 5 * 7 * 11],
+    ]
+    traced = ak.prod(depth2.to_typetracer(), -1, highlevel=False)
+    assert traced.form == ak.prod(depth2, -1, highlevel=False).form
+
+    assert to_list(ak.prod(depth2, -2, highlevel=False)) == [
+        [101 * 53, 103 * 59, 107 * 61, 109 * 67, 113 * 71],
+        [31 * 2, 37 * 3, 41 * 5, 43 * 7, 47 * 11],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -2, highlevel=False).form
+        == ak.prod(depth2, -2, highlevel=False).form
+    )
+
+    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
+        [101 * 31, 103 * 37, 107 * 41, 109 * 43, 113 * 47],
+        [],
+        [53 * 2, 59 * 3, 61 * 5, 67 * 7, 71 * 11],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
+        == ak.prod(depth2, -3, highlevel=False).form
+    )
+    del depth2
+
+
+def test_0115_generic_reducer_operation_IndexedOptionArray_3():
+    """prod on every axis when all items of some inner lists are None."""
+    content = ak.contents.NumpyArray(
+        np.array(
+            [
+                2,
+                3,
+                5,
+                7,
+                11,
+                31,
+                37,
+                41,
+                43,
+                47,
+                53,
+                59,
+                61,
+                67,
+                71,
+                101,
+                103,
+                107,
+                109,
+                113,
+            ],
+            dtype=np.int64,
+        )
+    )
+    index = ak.index.Index64(
+        np.array(
+            [
+                15,
+                16,
+                17,
+                18,
+                19,
+                -1,
+                -1,
+                -1,
+                -1,
+                -1,
+                10,
+                11,
+                12,
+                13,
+                14,
+                5,
+                6,
+                7,
+                8,
+                9,
+                -1,
+                -1,
+                -1,
+                -1,
+                -1,
+                0,
+                1,
+                2,
+                3,
+                4,
+            ],
+            dtype=np.int64,
+        )
+    )
+    indexedoptionarray = ak.contents.IndexedOptionArray(index, content)
+    offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 30], dtype=np.int64))
+    listoffsetarray = ak.contents.ListOffsetArray(offsets1, indexedoptionarray)
+    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
+    depth2 = ak.contents.ListOffsetArray(offsets2, listoffsetarray)
+    # highlevel=False, as in the other tests here, so .to_typetracer() sees a layout.
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
+
+    assert to_list(depth2) == [
+        [
+            [101, 103, 107, 109, 113],
+            [None, None, None, None, None],
+            [53, 59, 61, 67, 71],
+        ],
+        [[31, 37, 41, 43, 47], [None, None, None, None, None], [2, 3, 5, 7, 11]],
+    ]
+
+    assert to_list(ak.prod(depth2, -1, highlevel=False)) == [
+        [101 * 103 * 107 * 109 * 113, 1 * 1 * 1 * 1 * 1, 53 * 59 * 61 * 67 * 71],
+        [31 * 37 * 41 * 43 * 47, 1 * 1 * 1 * 1 * 1, 2 * 3 * 5 * 7 * 11],
+    ]
+    traced = ak.prod(depth2.to_typetracer(), -1, highlevel=False)
+    assert traced.form == ak.prod(depth2, -1, highlevel=False).form
+
+    assert to_list(ak.prod(depth2, -2, highlevel=False)) == [
+        [101 * 53, 103 * 59, 107 * 61, 109 * 67, 113 * 71],
+        [31 * 2, 37 * 3, 41 * 5, 43 * 7, 47 * 11],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -2, highlevel=False).form
+        == ak.prod(depth2, -2, highlevel=False).form
+    )
+
+    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
+        [101 * 31, 103 * 37, 107 * 41, 109 * 43, 113 * 47],
+        [1, 1, 1, 1, 1],
+        [53 * 2, 59 * 3, 61 * 5, 67 * 7, 71 * 11],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
+        == ak.prod(depth2, -3, highlevel=False).form
+    )
+    del depth2
+
+
+def test_0115_generic_reducer_operation_IndexedOptionArray_4():
+    """prod on every axis when some inner lists are a single None."""
+    content = ak.contents.NumpyArray(
+        np.array(
+            [
+                2,
+                3,
+                5,
+                7,
+                11,
+                31,
+                37,
+                41,
+                43,
+                47,
+                53,
+                59,
+                61,
+                67,
+                71,
+                101,
+                103,
+                107,
+                109,
+                113,
+            ],
+            dtype=np.int64,
+        )
+    )
+    index = ak.index.Index64(
+        np.array(
+            [
+                15,
+                16,
+                17,
+                18,
+                19,
+                -1,
+                10,
+                11,
+                12,
+                13,
+                14,
+                5,
+                6,
+                7,
+                8,
+                9,
+                -1,
+                0,
+                1,
+                2,
+                3,
+                4,
+            ],
+            dtype=np.int64,
+        )
+    )
+    indexedoptionarray = ak.contents.IndexedOptionArray(index, content)
+    offsets1 = ak.index.Index64(np.array([0, 5, 6, 11, 16, 17, 22], dtype=np.int64))
+    listoffsetarray = ak.contents.ListOffsetArray(offsets1, indexedoptionarray)
+    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
+    depth2 = ak.contents.ListOffsetArray(offsets2, listoffsetarray)
+    # highlevel=False, as in the other tests here, so .to_typetracer() sees a layout.
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
+
+    assert to_list(depth2) == [
+        [[101, 103, 107, 109, 113], [None], [53, 59, 61, 67, 71]],
+        [[31, 37, 41, 43, 47], [None], [2, 3, 5, 7, 11]],
+    ]
+
+    assert to_list(ak.prod(depth2, -1, highlevel=False)) == [
+        [101 * 103 * 107 * 109 * 113, 1, 53 * 59 * 61 * 67 * 71],
+        [31 * 37 * 41 * 43 * 47, 1, 2 * 3 * 5 * 7 * 11],
+    ]
+    traced = ak.prod(depth2.to_typetracer(), -1, highlevel=False)
+    assert traced.form == ak.prod(depth2, -1, highlevel=False).form
+
+    assert to_list(ak.prod(depth2, -2, highlevel=False)) == [
+        [101 * 53, 103 * 59, 107 * 61, 109 * 67, 113 * 71],
+        [31 * 2, 37 * 3, 41 * 5, 43 * 7, 47 * 11],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -2, highlevel=False).form
+        == ak.prod(depth2, -2, highlevel=False).form
+    )
+
+    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
+        [101 * 31, 103 * 37, 107 * 41, 109 * 43, 113 * 47],
+        [1],
+        [53 * 2, 59 * 3, 61 * 5, 67 * 7, 71 * 11],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
+        == ak.prod(depth2, -3, highlevel=False).form
+    )
+    del depth2

From bc7e1c02775b84feabe0c5afe97605a9086fc54f Mon Sep 17 00:00:00 2001
From: Manasvi Goyal <mg.manasvi@gmail.com>
Date: Fri, 21 Jun 2024 15:55:37 +0200
Subject: [PATCH 30/33] test: add more reducer tests 2

---
 ...est_3162_cuda_generic_reducer_operation.py | 627 ++++++++++++++++++
 1 file changed, 627 insertions(+)

diff --git a/tests-cuda/test_3162_cuda_generic_reducer_operation.py b/tests-cuda/test_3162_cuda_generic_reducer_operation.py
index 94d1bb3570..cd843a1f56 100644
--- a/tests-cuda/test_3162_cuda_generic_reducer_operation.py
+++ b/tests-cuda/test_3162_cuda_generic_reducer_operation.py
@@ -1320,3 +1320,630 @@ def test_0115_generic_reducer_operation_IndexedOptionArray_4():
         == ak.prod(depth2, -3, highlevel=False).form
     )
     del depth2
+
+
+def test_0115_generic_reducer_operation_sum():
+    """ak.sum over both axes of a 3 x 4 list, by negative and positive axis number."""
+    raw = np.array([1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048], dtype=np.int64)
+    values = ak.contents.NumpyArray(raw)
+    offsets = ak.index.Index64(np.array([0, 4, 8, 12], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(offsets, values)
+    depth1 = ak.to_backend(depth1, "cuda", highlevel=False)
+
+    assert to_list(ak.sum(depth1, -1, highlevel=False)) == [
+        1 + 2 + 4 + 8,
+        16 + 32 + 64 + 128,
+        256 + 512 + 1024 + 2048,
+    ]
+    assert (
+        ak.sum(depth1.to_typetracer(), -1, highlevel=False).form
+        == ak.sum(depth1, -1, highlevel=False).form
+    )
+    assert to_list(ak.sum(depth1, 1, highlevel=False)) == [
+        1 + 2 + 4 + 8,
+        16 + 32 + 64 + 128,
+        256 + 512 + 1024 + 2048,
+    ]
+    assert (
+        ak.sum(depth1.to_typetracer(), 1, highlevel=False).form
+        == ak.sum(depth1, 1, highlevel=False).form
+    )
+
+    assert to_list(ak.sum(depth1, -2, highlevel=False)) == [
+        1 + 16 + 256,
+        2 + 32 + 512,
+        4 + 64 + 1024,
+        8 + 128 + 2048,
+    ]
+    assert (
+        ak.sum(depth1.to_typetracer(), -2, highlevel=False).form
+        == ak.sum(depth1, -2, highlevel=False).form
+    )
+    assert to_list(ak.sum(depth1, 0, highlevel=False)) == [
+        1 + 16 + 256,
+        2 + 32 + 512,
+        4 + 64 + 1024,
+        8 + 128 + 2048,
+    ]
+    assert (
+        ak.sum(depth1.to_typetracer(), 0, highlevel=False).form
+        == ak.sum(depth1, 0, highlevel=False).form
+    )
+    del depth1
+
+
+def test_0115_generic_reducer_operation_any():
+    """ak.any over both axes of a ragged float list (zero counts as False)."""
+    raw = np.array([1.1, 2.2, 3.3, 0.0, 2.2, 0.0, 0.0, 0.0, 0.0, 0.0])
+    values = ak.contents.NumpyArray(raw)
+    offsets = ak.index.Index64(np.array([0, 3, 6, 10], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(offsets, values)
+    depth1 = ak.to_backend(depth1, "cuda", highlevel=False)
+
+    assert to_list(depth1) == [
+        [1.1, 2.2, 3.3],
+        [0.0, 2.2, 0.0],
+        [0.0, 0.0, 0.0, 0.0],
+    ]
+
+    assert to_list(ak.any(depth1, -1, highlevel=False)) == [True, True, False]
+    assert (
+        ak.any(depth1.to_typetracer(), -1, highlevel=False).form
+        == ak.any(depth1, -1, highlevel=False).form
+    )
+    assert to_list(ak.any(depth1, 1, highlevel=False)) == [True, True, False]
+    assert (
+        ak.any(depth1.to_typetracer(), 1, highlevel=False).form
+        == ak.any(depth1, 1, highlevel=False).form
+    )
+
+    assert to_list(ak.any(depth1, -2, highlevel=False)) == [True, True, True, False]
+    assert (
+        ak.any(depth1.to_typetracer(), -2, highlevel=False).form
+        == ak.any(depth1, -2, highlevel=False).form
+    )
+    assert to_list(ak.any(depth1, 0, highlevel=False)) == [True, True, True, False]
+    assert (
+        ak.any(depth1.to_typetracer(), 0, highlevel=False).form
+        == ak.any(depth1, 0, highlevel=False).form
+    )
+    del depth1
+
+
+def test_0115_generic_reducer_operation_all():
+    """ak.all over both axes of a ragged float list (zero counts as False)."""
+    raw = np.array([1.1, 2.2, 3.3, 0.0, 2.2, 0.0, 0.0, 2.2, 0.0, 4.4])
+    values = ak.contents.NumpyArray(raw)
+    offsets = ak.index.Index64(np.array([0, 3, 6, 10], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(offsets, values)
+    depth1 = ak.to_backend(depth1, "cuda", highlevel=False)
+
+    assert to_list(depth1) == [
+        [1.1, 2.2, 3.3],
+        [0.0, 2.2, 0.0],
+        [0.0, 2.2, 0.0, 4.4],
+    ]
+
+    assert to_list(ak.all(depth1, -1, highlevel=False)) == [True, False, False]
+    assert (
+        ak.all(depth1.to_typetracer(), -1, highlevel=False).form
+        == ak.all(depth1, -1, highlevel=False).form
+    )
+    assert to_list(ak.all(depth1, 1, highlevel=False)) == [True, False, False]
+    assert (
+        ak.all(depth1.to_typetracer(), 1, highlevel=False).form
+        == ak.all(depth1, 1, highlevel=False).form
+    )
+
+    assert to_list(ak.all(depth1, -2, highlevel=False)) == [False, True, False, True]
+    assert (
+        ak.all(depth1.to_typetracer(), -2, highlevel=False).form
+        == ak.all(depth1, -2, highlevel=False).form
+    )
+    assert to_list(ak.all(depth1, 0, highlevel=False)) == [False, True, False, True]
+    assert (
+        ak.all(depth1.to_typetracer(), 0, highlevel=False).form
+        == ak.all(depth1, 0, highlevel=False).form
+    )
+    del depth1
+
+
+def test_0115_generic_reducer_operation_count():
+    """ak.count over both axes of a ragged float list; zeros are counted too."""
+    raw = np.array([1.1, 2.2, 3.3, 0.0, 2.2, 0.0, 0.0, 2.2, 0.0, 4.4])
+    values = ak.contents.NumpyArray(raw)
+    offsets = ak.index.Index64(np.array([0, 3, 6, 10], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(offsets, values)
+    depth1 = ak.to_backend(depth1, "cuda", highlevel=False)
+
+    assert to_list(depth1) == [
+        [1.1, 2.2, 3.3],
+        [0.0, 2.2, 0.0],
+        [0.0, 2.2, 0.0, 4.4],
+    ]
+
+    assert to_list(ak.count(depth1, -1, highlevel=False)) == [3, 3, 4]
+    assert (
+        ak.count(depth1.to_typetracer(), -1, highlevel=False).form
+        == ak.count(depth1, -1, highlevel=False).form
+    )
+    assert to_list(ak.count(depth1, 1, highlevel=False)) == [3, 3, 4]
+    assert (
+        ak.count(depth1.to_typetracer(), 1, highlevel=False).form
+        == ak.count(depth1, 1, highlevel=False).form
+    )
+
+    assert to_list(ak.count(depth1, -2, highlevel=False)) == [3, 3, 3, 1]
+    assert (
+        ak.count(depth1.to_typetracer(), -2, highlevel=False).form
+        == ak.count(depth1, -2, highlevel=False).form
+    )
+    assert to_list(ak.count(depth1, 0, highlevel=False)) == [3, 3, 3, 1]
+    assert (
+        ak.count(depth1.to_typetracer(), 0, highlevel=False).form
+        == ak.count(depth1, 0, highlevel=False).form
+    )
+    del depth1
+
+
+def test_0115_generic_reducer_operation_count_nonzero():
+    """ak.count_nonzero over both axes of a ragged float list."""
+    raw = np.array([1.1, 2.2, 3.3, 0.0, 2.2, 0.0, 0.0, 2.2, 0.0, 4.4])
+    values = ak.contents.NumpyArray(raw)
+    offsets = ak.index.Index64(np.array([0, 3, 6, 10], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(offsets, values)
+    depth1 = ak.to_backend(depth1, "cuda", highlevel=False)
+
+    assert to_list(depth1) == [
+        [1.1, 2.2, 3.3],
+        [0.0, 2.2, 0.0],
+        [0.0, 2.2, 0.0, 4.4],
+    ]
+
+    assert to_list(ak.count_nonzero(depth1, -1, highlevel=False)) == [3, 1, 2]
+    assert (
+        ak.count_nonzero(depth1.to_typetracer(), -1, highlevel=False).form
+        == ak.count_nonzero(depth1, -1, highlevel=False).form
+    )
+    assert to_list(ak.count_nonzero(depth1, 1, highlevel=False)) == [3, 1, 2]
+    assert (
+        ak.count_nonzero(depth1.to_typetracer(), 1, highlevel=False).form
+        == ak.count_nonzero(depth1, 1, highlevel=False).form
+    )
+
+    assert to_list(ak.count_nonzero(depth1, -2, highlevel=False)) == [1, 3, 1, 1]
+    assert (
+        ak.count_nonzero(depth1.to_typetracer(), -2, highlevel=False).form
+        == ak.count_nonzero(depth1, -2, highlevel=False).form
+    )
+    assert to_list(ak.count_nonzero(depth1, 0, highlevel=False)) == [1, 3, 1, 1]
+    assert (
+        ak.count_nonzero(depth1.to_typetracer(), 0, highlevel=False).form
+        == ak.count_nonzero(depth1, 0, highlevel=False).form
+    )
+    del depth1
+
+
+def test_0115_generic_reducer_operation_count_min_1():
+    """ak.min over both axes of a ragged float list."""
+    raw = np.array([1.1, 2.2, 3.3, 0.0, 2.2, 0.0, 0.0, 2.2, 0.0, 4.4])
+    values = ak.contents.NumpyArray(raw)
+    offsets = ak.index.Index64(np.array([0, 3, 6, 10], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(offsets, values)
+    depth1 = ak.to_backend(depth1, "cuda", highlevel=False)
+
+    assert to_list(depth1) == [
+        [1.1, 2.2, 3.3],
+        [0.0, 2.2, 0.0],
+        [0.0, 2.2, 0.0, 4.4],
+    ]
+
+    assert to_list(ak.min(depth1, -1, highlevel=False)) == [1.1, 0.0, 0.0]
+    assert (
+        ak.min(depth1.to_typetracer(), -1, highlevel=False).form
+        == ak.min(depth1, -1, highlevel=False).form
+    )
+    assert to_list(ak.min(depth1, 1, highlevel=False)) == [1.1, 0.0, 0.0]
+    assert (
+        ak.min(depth1.to_typetracer(), 1, highlevel=False).form
+        == ak.min(depth1, 1, highlevel=False).form
+    )
+
+    assert to_list(ak.min(depth1, -2, highlevel=False)) == [0.0, 2.2, 0.0, 4.4]
+    assert (
+        ak.min(depth1.to_typetracer(), -2, highlevel=False).form
+        == ak.min(depth1, -2, highlevel=False).form
+    )
+    assert to_list(ak.min(depth1, 0, highlevel=False)) == [0.0, 2.2, 0.0, 4.4]
+    assert (
+        ak.min(depth1.to_typetracer(), 0, highlevel=False).form
+        == ak.min(depth1, 0, highlevel=False).form
+    )
+    del depth1
+
+
+def test_0115_generic_reducer_operation_count_min_2():  # ak.min over jagged booleans on the "cuda" backend
+    content2 = ak.contents.NumpyArray(
+        np.array([True, True, True, False, True, False, False, True, False, True])
+    )
+    offsets3 = ak.index.Index64(np.array([0, 3, 6, 10], dtype=np.int64))  # list lengths 3, 3, 4
+    depth1 = ak.contents.ListOffsetArray(offsets3, content2)
+    depth1 = ak.to_backend(depth1, "cuda", highlevel=False)  # convert the layout to the "cuda" backend
+
+    assert to_list(depth1) == [  # sanity check: the converted layout still holds the same lists
+        [True, True, True],
+        [False, True, False],
+        [False, True, False, True],
+    ]
+
+    assert to_list(ak.min(depth1, -1, highlevel=False)) == [True, False, False]  # False wins wherever a list contains one
+    assert (  # form from the typetracer run must equal the form from the real run
+        ak.min(depth1.to_typetracer(), -1, highlevel=False).form
+        == ak.min(depth1, -1, highlevel=False).form
+    )
+    assert to_list(ak.min(depth1, 1, highlevel=False)) == [True, False, False]  # axis=1: same expected values as axis=-1
+    assert (
+        ak.min(depth1.to_typetracer(), 1, highlevel=False).form
+        == ak.min(depth1, 1, highlevel=False).form
+    )
+
+    assert to_list(ak.min(depth1, -2, highlevel=False)) == [False, True, False, True]  # minimum per position across lists
+    assert (
+        ak.min(depth1.to_typetracer(), -2, highlevel=False).form
+        == ak.min(depth1, -2, highlevel=False).form
+    )
+    assert to_list(ak.min(depth1, 0, highlevel=False)) == [False, True, False, True]  # axis=0: same expected values as axis=-2
+    assert (
+        ak.min(depth1.to_typetracer(), 0, highlevel=False).form
+        == ak.min(depth1, 0, highlevel=False).form
+    )
+    del depth1  # drop the reference to the cuda-backed layout
+
+
+def test_0115_generic_reducer_operation_count_max_1():  # ak.max over jagged floats on the "cuda" backend
+    content2 = ak.contents.NumpyArray(
+        np.array([1.1, 2.2, 3.3, 0.0, 2.2, 0.0, 0.0, 2.2, 0.0, 4.4])
+    )
+    offsets3 = ak.index.Index64(np.array([0, 3, 6, 10], dtype=np.int64))  # list lengths 3, 3, 4
+    depth1 = ak.contents.ListOffsetArray(offsets3, content2)
+    depth1 = ak.to_backend(depth1, "cuda", highlevel=False)  # convert the layout to the "cuda" backend
+
+    assert to_list(depth1) == [  # sanity check: the converted layout still holds the same lists
+        [1.1, 2.2, 3.3],
+        [0.0, 2.2, 0.0],
+        [0.0, 2.2, 0.0, 4.4],
+    ]
+
+    assert to_list(ak.max(depth1, -1, highlevel=False)) == [3.3, 2.2, 4.4]  # one maximum per list
+    assert (  # form from the typetracer run must equal the form from the real run
+        ak.max(depth1.to_typetracer(), -1, highlevel=False).form
+        == ak.max(depth1, -1, highlevel=False).form
+    )
+    assert to_list(ak.max(depth1, 1, highlevel=False)) == [3.3, 2.2, 4.4]  # axis=1: same expected values as axis=-1
+    assert (
+        ak.max(depth1.to_typetracer(), 1, highlevel=False).form
+        == ak.max(depth1, 1, highlevel=False).form
+    )
+
+    assert to_list(ak.max(depth1, -2, highlevel=False)) == [1.1, 2.2, 3.3, 4.4]  # maximum per position across lists
+    assert (
+        ak.max(depth1.to_typetracer(), -2, highlevel=False).form
+        == ak.max(depth1, -2, highlevel=False).form
+    )
+    assert to_list(ak.max(depth1, 0, highlevel=False)) == [1.1, 2.2, 3.3, 4.4]  # axis=0: same expected values as axis=-2
+    assert (
+        ak.max(depth1.to_typetracer(), 0, highlevel=False).form
+        == ak.max(depth1, 0, highlevel=False).form
+    )
+    del depth1  # drop the reference to the cuda-backed layout
+
+
+def test_0115_generic_reducer_operation_count_max_2():  # ak.max over jagged booleans on the "cuda" backend
+    content2 = ak.contents.NumpyArray(
+        np.array([False, True, True, False, True, False, False, False, False, False])
+    )
+    offsets3 = ak.index.Index64(np.array([0, 3, 6, 10], dtype=np.int64))  # list lengths 3, 3, 4
+    depth1 = ak.contents.ListOffsetArray(offsets3, content2)
+    depth1 = ak.to_backend(depth1, "cuda", highlevel=False)  # convert the layout to the "cuda" backend
+
+    assert to_list(depth1) == [  # sanity check: the converted layout still holds the same lists
+        [False, True, True],
+        [False, True, False],
+        [False, False, False, False],
+    ]
+
+    assert to_list(ak.max(depth1, -1, highlevel=False)) == [True, True, False]  # True wins wherever a list contains one
+    assert (  # form from the typetracer run must equal the form from the real run
+        ak.max(depth1.to_typetracer(), -1, highlevel=False).form
+        == ak.max(depth1, -1, highlevel=False).form
+    )
+    assert to_list(ak.max(depth1, 1, highlevel=False)) == [True, True, False]  # axis=1: same expected values as axis=-1
+    assert (
+        ak.max(depth1.to_typetracer(), 1, highlevel=False).form
+        == ak.max(depth1, 1, highlevel=False).form
+    )
+
+    assert to_list(ak.max(depth1, -2, highlevel=False)) == [False, True, True, False]  # maximum per position across lists
+    assert (
+        ak.max(depth1.to_typetracer(), -2, highlevel=False).form
+        == ak.max(depth1, -2, highlevel=False).form
+    )
+    assert to_list(ak.max(depth1, 0, highlevel=False)) == [False, True, True, False]  # axis=0: same expected values as axis=-2
+    assert (
+        ak.max(depth1.to_typetracer(), 0, highlevel=False).form
+        == ak.max(depth1, 0, highlevel=False).form
+    )
+    del depth1  # drop the reference to the cuda-backed layout
+
+
+def test_0115_generic_reducer_operation_mask():  # ak.min on empty lists with mask_identity off and on
+    content = ak.contents.NumpyArray(
+        np.array([1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9])
+    )
+    offsets = ak.index.Index64(np.array([0, 3, 3, 5, 6, 6, 6, 9], dtype=np.int64))  # list lengths 3, 0, 2, 1, 0, 0, 3
+    array = ak.contents.ListOffsetArray(offsets, content)
+    array = ak.to_backend(array, "cuda", highlevel=False)  # convert the layout to the "cuda" backend
+
+    assert to_list(ak.min(array, axis=-1, mask_identity=False, highlevel=False)) == [  # empty lists are expected to give inf
+        1.1,
+        np.inf,
+        4.4,
+        6.6,
+        np.inf,
+        np.inf,
+        7.7,
+    ]
+    assert (  # form from the typetracer run must equal the form from the real run
+        ak.min(
+            array.to_typetracer(), axis=-1, mask_identity=False, highlevel=False
+        ).form
+        == ak.min(array, axis=-1, mask_identity=False, highlevel=False).form
+    )
+    assert to_list(ak.min(array, axis=-1, mask_identity=True, highlevel=False)) == [  # empty lists are expected to give None
+        1.1,
+        None,
+        4.4,
+        6.6,
+        None,
+        None,
+        7.7,
+    ]
+    assert (
+        ak.min(array.to_typetracer(), axis=-1, mask_identity=True, highlevel=False).form
+        == ak.min(array, axis=-1, mask_identity=True, highlevel=False).form
+    )
+    del array  # drop the reference to the cuda-backed layout
+
+
+def test_0115_generic_reducer_operation_ByteMaskedArray():  # ak.argmin through a ByteMaskedArray on the "cuda" backend
+    content = ak.operations.from_iter(
+        [
+            [[1.1, 0.0, 2.2], [], [3.3, 4.4]],
+            [],
+            [[5.5]],
+            [[6.6, 9.9, 8.8, 7.7]],
+            [[], [12.2, 11.1, 10.0]],
+        ],
+        highlevel=False,
+    )
+    mask = ak.index.Index8(np.array([0, 0, 1, 1, 0], dtype=np.int8))
+    v2_array = ak.contents.ByteMaskedArray(mask, content, valid_when=False)  # mask byte 0 marks a valid entry
+    v2_array = ak.to_backend(v2_array, "cuda", highlevel=False)  # convert the layout to the "cuda" backend
+
+    assert to_list(v2_array) == [  # entries 2 and 3 (mask byte 1) read back as None
+        [[1.1, 0.0, 2.2], [], [3.3, 4.4]],
+        [],
+        None,
+        None,
+        [[], [12.2, 11.1, 10.0]],
+    ]
+    assert to_list(ak.argmin(v2_array, axis=-1, highlevel=False)) == [  # position of the minimum in each innermost list
+        [1, None, 0],
+        [],
+        None,
+        None,
+        [None, 2],
+    ]
+    assert (  # form from the typetracer run must equal the form from the real run
+        ak.argmin(v2_array.to_typetracer(), axis=-1, highlevel=False).form
+        == ak.argmin(v2_array, axis=-1, highlevel=False).form
+    )
+    del v2_array  # drop the reference to the cuda-backed layout
+
+
+def test_0115_generic_reducer_operation_keepdims():  # ak.prod with keepdims, checked against a NumPy array of the same values
+    nparray = np.array(primes[: 2 * 3 * 5], dtype=np.int64).reshape(2, 3, 5)  # reference: regular (2, 3, 5) array
+    content1 = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5], dtype=np.int64))
+    offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 30], dtype=np.int64))  # six inner lists of length 5
+    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))  # two outer lists of length 3
+    depth2 = ak.contents.ListOffsetArray(
+        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
+    )
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)  # convert the layout to the "cuda" backend
+
+    assert to_list(depth2) == [  # sanity check: nested lists mirror the (2, 3, 5) layout of nparray
+        [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]],
+        [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]],
+    ]
+
+    assert to_list(  # keepdims=False: cuda result must match the result for nparray, axis by axis
+        ak.prod(depth2, axis=-1, keepdims=False, highlevel=False)
+    ) == to_list(ak.prod(nparray, axis=-1, keepdims=False, highlevel=False))
+    assert (  # form from the typetracer run must equal the form from the real run
+        ak.prod(depth2.to_typetracer(), axis=-1, keepdims=False, highlevel=False).form
+        == ak.prod(depth2, axis=-1, keepdims=False, highlevel=False).form
+    )
+    assert to_list(
+        ak.prod(depth2, axis=-2, keepdims=False, highlevel=False)
+    ) == to_list(ak.prod(nparray, axis=-2, keepdims=False, highlevel=False))
+    assert (
+        ak.prod(depth2.to_typetracer(), axis=-2, keepdims=False, highlevel=False).form
+        == ak.prod(depth2, axis=-2, keepdims=False, highlevel=False).form
+    )
+    assert to_list(
+        ak.prod(depth2, axis=-3, keepdims=False, highlevel=False)
+    ) == to_list(ak.prod(nparray, axis=-3, keepdims=False, highlevel=False))
+    assert (
+        ak.prod(depth2.to_typetracer(), axis=-3, keepdims=False, highlevel=False).form
+        == ak.prod(depth2, axis=-3, keepdims=False, highlevel=False).form
+    )
+
+    assert to_list(ak.prod(depth2, axis=-1, keepdims=True, highlevel=False)) == to_list(  # same comparisons with keepdims=True
+        ak.prod(nparray, axis=-1, keepdims=True, highlevel=False)
+    )
+    assert (
+        ak.prod(depth2.to_typetracer(), axis=-1, keepdims=True, highlevel=False).form
+        == ak.prod(depth2, axis=-1, keepdims=True, highlevel=False).form
+    )
+    assert to_list(ak.prod(depth2, axis=-2, keepdims=True, highlevel=False)) == to_list(
+        ak.prod(nparray, axis=-2, keepdims=True, highlevel=False)
+    )
+    assert (
+        ak.prod(depth2.to_typetracer(), axis=-2, keepdims=True, highlevel=False).form
+        == ak.prod(depth2, axis=-2, keepdims=True, highlevel=False).form
+    )
+    assert to_list(ak.prod(depth2, axis=-3, keepdims=True, highlevel=False)) == to_list(
+        ak.prod(nparray, axis=-3, keepdims=True, highlevel=False)
+    )
+    assert (
+        ak.prod(depth2.to_typetracer(), axis=-3, keepdims=True, highlevel=False).form
+        == ak.prod(depth2, axis=-3, keepdims=True, highlevel=False).form
+    )
+    del depth2  # drop the reference to the cuda-backed layout
+
+
+def test_0115_generic_reducer_operation_highlevel_1():  # count/count_nonzero/sum/prod/min/max on a nested integer array
+    array = ak.highlevel.Array(
+        [[[2, 3, 5], [], [7, 11], [13]], [], [[17, 19], [23]]], check_valid=True
+    )
+    array = ak.to_backend(array, "cuda", highlevel=False)  # convert to the "cuda" backend
+
+    assert ak.operations.count(array) == 9  # no axis given: a single total for the nine values
+    assert to_list(ak.operations.count(array, axis=-1)) == [  # length of each innermost list
+        [3, 0, 2, 1],
+        [],
+        [2, 1],
+    ]
+    assert to_list(ak.operations.count(array, axis=2)) == [  # axis=2: same expected values as axis=-1
+        [3, 0, 2, 1],
+        [],
+        [2, 1],
+    ]
+    assert to_list(ak.operations.count(array, axis=-1, keepdims=True)) == [  # keepdims wraps each count in a length-1 list
+        [[3], [0], [2], [1]],
+        [],
+        [[2], [1]],
+    ]
+    assert to_list(ak.operations.count(array, axis=-2)) == [  # per position, how many inner lists have an element there
+        [3, 2, 1],
+        [],
+        [2, 1],
+    ]
+    assert to_list(ak.operations.count(array, axis=1)) == [  # axis=1: same expected values as axis=-2
+        [3, 2, 1],
+        [],
+        [2, 1],
+    ]
+    assert to_list(ak.operations.count(array, axis=-2, keepdims=True)) == [
+        [[3, 2, 1]],
+        [[]],
+        [[2, 1]],
+    ]
+
+    assert ak.operations.count_nonzero(array) == 9  # every value in the input is nonzero, so this equals count
+    assert to_list(ak.operations.count_nonzero(array, axis=-1)) == [
+        [3, 0, 2, 1],
+        [],
+        [2, 1],
+    ]
+    assert to_list(ak.operations.count_nonzero(array, axis=-2)) == [
+        [3, 2, 1],
+        [],
+        [2, 1],
+    ]
+
+    assert ak.operations.sum(array) == 2 + 3 + 5 + 7 + 11 + 13 + 17 + 19 + 23
+    assert to_list(ak.operations.sum(array, axis=-1)) == [  # the empty list is expected to sum to 0
+        [2 + 3 + 5, 0, 7 + 11, 13],
+        [],
+        [17 + 19, 23],
+    ]
+    assert to_list(ak.operations.sum(array, axis=-2)) == [  # sums of elements at the same position of sibling lists
+        [2 + 7 + 13, 3 + 11, 5],
+        [],
+        [17 + 23, 19],
+    ]
+
+    assert ak.operations.prod(array) == 2 * 3 * 5 * 7 * 11 * 13 * 17 * 19 * 23
+    assert to_list(ak.operations.prod(array, axis=-1)) == [  # the empty list is expected to multiply to 1
+        [2 * 3 * 5, 1, 7 * 11, 13],
+        [],
+        [17 * 19, 23],
+    ]
+    assert to_list(ak.operations.prod(array, axis=-2)) == [
+        [2 * 7 * 13, 3 * 11, 5],
+        [],
+        [17 * 23, 19],
+    ]
+
+    assert ak.operations.min(array) == 2
+    assert to_list(ak.operations.min(array, axis=-1)) == [  # the empty list is expected to give None
+        [2, None, 7, 13],
+        [],
+        [17, 23],
+    ]
+    assert to_list(ak.operations.min(array, axis=-2)) == [
+        [2, 3, 5],
+        [],
+        [17, 19],
+    ]
+
+    assert ak.operations.max(array) == 23
+    assert to_list(ak.operations.max(array, axis=-1)) == [  # the empty list is expected to give None
+        [5, None, 11, 13],
+        [],
+        [19, 23],
+    ]
+    assert to_list(ak.operations.max(array, axis=-2)) == [
+        [13, 11, 5],
+        [],
+        [23, 19],
+    ]
+    del array  # drop the reference to the cuda-backed array
+
+
+def test_0115_generic_reducer_operation_highlevel_2():  # ak.any / ak.all on a nested boolean array
+    array = ak.highlevel.Array(
+        [
+            [[True, False, True], [], [False, False], [True]],
+            [],
+            [[False, True], [True]],
+        ],
+        check_valid=True,
+    )
+    array = ak.to_backend(array, "cuda", highlevel=False)  # convert to the "cuda" backend
+
+    assert ak.operations.any(array) is np.bool_(True)  # NOTE(review): identity check assumes a NumPy bool scalar is returned; confirm for the cuda backend
+    assert to_list(ak.operations.any(array, axis=-1)) == [  # the empty list is expected to give False
+        [True, False, False, True],
+        [],
+        [True, True],
+    ]
+    assert to_list(ak.operations.any(array, axis=-2)) == [  # any over elements at the same position of sibling lists
+        [True, False, True],
+        [],
+        [True, True],
+    ]
+
+    assert ak.operations.all(array) is np.bool_(False)  # NOTE(review): same identity-check assumption as for any() above
+    assert to_list(ak.operations.all(array, axis=-1)) == [  # the empty list is expected to give True
+        [False, True, False, True],
+        [],
+        [False, True],
+    ]
+    assert to_list(ak.operations.all(array, axis=-2)) == [  # all over elements at the same position of sibling lists
+        [False, False, True],
+        [],
+        [False, True],
+    ]
+    del array  # drop the reference to the cuda-backed array

From b59e428035a6dae95bc7e3c392c7ae284e0832eb Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Mon, 24 Jun 2024 12:02:50 +0200
Subject: [PATCH 31/33] fix: error for EmptyArray

---
 ...tOffsetArray_reduce_local_outoffsets_64.cu | 18 +++++++--------
 .../cuda_kernels/awkward_reduce_argmax.cu     | 22 +++++++++----------
 .../cuda_kernels/awkward_reduce_argmin.cu     | 22 +++++++++----------
 .../cuda_kernels/awkward_reduce_count_64.cu   | 18 +++++++--------
 .../awkward_reduce_countnonzero.cu            | 18 +++++++--------
 .../cuda/cuda_kernels/awkward_reduce_max.cu   | 18 +++++++--------
 .../cuda/cuda_kernels/awkward_reduce_min.cu   | 18 +++++++--------
 .../cuda/cuda_kernels/awkward_reduce_prod.cu  | 18 +++++++--------
 .../cuda_kernels/awkward_reduce_prod_bool.cu  | 18 +++++++--------
 .../cuda/cuda_kernels/awkward_reduce_sum.cu   | 18 +++++++--------
 .../cuda_kernels/awkward_reduce_sum_bool.cu   | 18 +++++++--------
 .../awkward_reduce_sum_int32_bool_64.cu       | 18 +++++++--------
 .../awkward_reduce_sum_int64_bool_64.cu       | 18 +++++++--------
 13 files changed, 121 insertions(+), 121 deletions(-)

diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu
index 42e8119d46..62846dd0a7 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu
@@ -59,17 +59,17 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_b(
     }
     __syncthreads();
 
-    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
-      int64_t val = 0;
-      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        val = temp[thread_id - stride];
+    if (thread_id < lenparents) {
+      for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+        int64_t val = 0;
+        if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+          val = temp[thread_id - stride];
+        }
+        __syncthreads();
+        temp[thread_id] += val;
+        __syncthreads();
       }
-      __syncthreads();
-      temp[thread_id] += val;
-      __syncthreads();
-    }
 
-    if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
         atomicAdd(&scan_in_array[parent], temp[thread_id]);
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu
index df515f05a4..71754d3588 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu
@@ -59,19 +59,19 @@ awkward_reduce_argmax_b(
     }
     __syncthreads();
 
-    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
-      int64_t index = -1;
-      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        index = temp[thread_id - stride];
-      }
-      if (index != -1 && (temp[thread_id] == -1 || fromptr[index] > fromptr[temp[thread_id]] ||
-         (fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) {
-        temp[thread_id] = index;
+    if (thread_id < lenparents) {
+      for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+        int64_t index = -1;
+        if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+          index = temp[thread_id - stride];
+        }
+        if (index != -1 && (temp[thread_id] == -1 || fromptr[index] > fromptr[temp[thread_id]] ||
+          (fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) {
+          temp[thread_id] = index;
+        }
+        __syncthreads();
       }
-      __syncthreads();
-    }
 
-    if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
         atomicExch(&atomic_toptr[parent], temp[thread_id]);
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu
index af1d3fd93d..b8517098e8 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu
@@ -59,19 +59,19 @@ awkward_reduce_argmin_b(
     }
     __syncthreads();
 
-    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
-      int64_t index = -1;
-      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        index = temp[thread_id - stride];
-      }
-      if (index != -1 && (temp[thread_id] == -1 || fromptr[index] < fromptr[temp[thread_id]] ||
-         (fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) {
-        temp[thread_id] = index;
+    if (thread_id < lenparents) {
+      for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+        int64_t index = -1;
+        if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+          index = temp[thread_id - stride];
+        }
+        if (index != -1 && (temp[thread_id] == -1 || fromptr[index] < fromptr[temp[thread_id]] ||
+          (fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) {
+          temp[thread_id] = index;
+        }
+        __syncthreads();
       }
-      __syncthreads();
-    }
 
-    if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
         atomicExch(&atomic_toptr[parent], temp[thread_id]);
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu
index 9c55e69600..ebe8104be2 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu
@@ -52,17 +52,17 @@ awkward_reduce_count_64_b(
     }
     __syncthreads();
 
-    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
-      int64_t val = 0;
-      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        val = temp[thread_id - stride];
+    if (thread_id < lenparents) {
+      for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+        int64_t val = 0;
+        if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+          val = temp[thread_id - stride];
+        }
+        __syncthreads();
+        temp[thread_id] += val;
+        __syncthreads();
       }
-      __syncthreads();
-      temp[thread_id] += val;
-      __syncthreads();
-    }
 
-    if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
         atomicAdd(&toptr[parent], temp[thread_id]);
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu
index ffcb0b8bd3..4dc9f50e5f 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu
@@ -54,17 +54,17 @@ awkward_reduce_countnonzero_b(
     }
     __syncthreads();
 
-    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
-      int64_t val = 0;
-      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        val = temp[thread_id - stride];
+    if (thread_id < lenparents) {
+      for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+        int64_t val = 0;
+        if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+          val = temp[thread_id - stride];
+        }
+        __syncthreads();
+        temp[thread_id] += val;
+        __syncthreads();
       }
-      __syncthreads();
-      temp[thread_id] += val;
-      __syncthreads();
-    }
 
-    if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
         atomicAdd(&toptr[parent], temp[thread_id]);
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu
index 6a3fe66055..2941aa417e 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu
@@ -55,18 +55,18 @@ awkward_reduce_max_b(
     }
     __syncthreads();
 
-    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
-      T val = identity;
+    if (thread_id < lenparents) {
+      for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+        T val = identity;
 
-      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        val = temp[idx - stride];
+        if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+          val = temp[idx - stride];
+        }
+        __syncthreads();
+        temp[thread_id] = val > temp[thread_id] ? val : temp[thread_id];
+        __syncthreads();
       }
-      __syncthreads();
-      temp[thread_id] = val > temp[thread_id] ? val : temp[thread_id];
-      __syncthreads();
-    }
 
-    if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
         atomicMax(&toptr[parent], temp[thread_id]);
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu
index 12a72b338f..e709d687f8 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu
@@ -56,17 +56,17 @@ awkward_reduce_min_b(
     }
     __syncthreads();
 
-    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
-      T val = identity;
-      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        val = temp[thread_id - stride];
+    if (thread_id < lenparents) {
+      for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+        T val = identity;
+        if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+          val = temp[thread_id - stride];
+        }
+        __syncthreads();
+        temp[thread_id] = val < temp[thread_id] ? val : temp[thread_id];
+        __syncthreads();
       }
-      __syncthreads();
-      temp[thread_id] = val < temp[thread_id] ? val : temp[thread_id];
-      __syncthreads();
-    }
 
-    if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
         atomicMin(&toptr[parent], temp[thread_id]);
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu
index 9248e20efc..e24e3ab56c 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu
@@ -59,17 +59,17 @@ awkward_reduce_prod_b(
     }
     __syncthreads();
 
-    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
-      T val = 1;
-      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        val = temp[thread_id - stride];
+    if (thread_id < lenparents) {
+      for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+        T val = 1;
+        if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+          val = temp[thread_id - stride];
+        }
+        __syncthreads();
+        temp[thread_id] *= val;
+        __syncthreads();
       }
-      __syncthreads();
-      temp[thread_id] *= val;
-      __syncthreads();
-    }
 
-    if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
         atomicMul(&atomic_toptr[parent], temp[thread_id]);
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu
index 9d85b366c7..db09188f1f 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu
@@ -59,17 +59,17 @@ awkward_reduce_prod_bool_b(
     }
     __syncthreads();
 
-    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
-      T val = 1;
-      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        val = temp[thread_id - stride];
+    if (thread_id < lenparents) {
+      for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+        T val = 1;
+        if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+          val = temp[thread_id - stride];
+        }
+        __syncthreads();
+        temp[thread_id] &= (val != 0);
+        __syncthreads();
       }
-      __syncthreads();
-      temp[thread_id] &= (val != 0);
-      __syncthreads();
-    }
 
-    if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
         atomicAnd(&atomic_toptr[parent], temp[thread_id]);
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu
index 8ce2b8159c..66c320d87f 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu
@@ -54,17 +54,17 @@ awkward_reduce_sum_b(
     }
     __syncthreads();
 
-    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
-      T val = 0;
-      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        val = temp[thread_id - stride];
+    if (thread_id < lenparents) {
+      for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+        T val = 0;
+        if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+          val = temp[thread_id - stride];
+        }
+        __syncthreads();
+        temp[thread_id] += val;
+        __syncthreads();
       }
-      __syncthreads();
-      temp[thread_id] += val;
-      __syncthreads();
-    }
 
-    if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
         atomicAdd(&toptr[parent], temp[thread_id]);
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu
index f85df8e20a..168ee17cdb 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu
@@ -59,17 +59,17 @@ awkward_reduce_sum_bool_b(
     }
     __syncthreads();
 
-    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
-      T val = 0;
-      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        val = temp[thread_id - stride];
+    if (thread_id < lenparents) {
+      for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+        T val = 0;
+        if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+          val = temp[thread_id - stride];
+        }
+        __syncthreads();
+        temp[thread_id] |= (val != 0);
+        __syncthreads();
       }
-      __syncthreads();
-      temp[thread_id] |= (val != 0);
-      __syncthreads();
-    }
 
-    if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
         atomicOr(&atomic_toptr[parent], temp[thread_id]);
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu
index f52b6fb21c..67da88ec04 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu
@@ -54,17 +54,17 @@ awkward_reduce_sum_int32_bool_64_b(
     }
     __syncthreads();
 
-    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
-      T val = 0;
-      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        val = temp[thread_id - stride];
+    if (thread_id < lenparents) {
+      for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+        T val = 0;
+        if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+          val = temp[thread_id - stride];
+        }
+        __syncthreads();
+        temp[thread_id] += val;
+        __syncthreads();
       }
-      __syncthreads();
-      temp[thread_id] += val;
-      __syncthreads();
-    }
 
-    if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
         atomicAdd(&toptr[parent], temp[thread_id]);
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu
index 7e220cccc0..2468760ac5 100644
--- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu
+++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu
@@ -54,17 +54,17 @@ awkward_reduce_sum_int64_bool_64_b(
     }
     __syncthreads();
 
-    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
-      T val = 0;
-      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-        val = temp[thread_id - stride];
+    if (thread_id < lenparents) {
+      for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+        T val = 0;
+        if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+          val = temp[thread_id - stride];
+        }
+        __syncthreads();
+        temp[thread_id] += val;
+        __syncthreads();
       }
-      __syncthreads();
-      temp[thread_id] += val;
-      __syncthreads();
-    }
 
-    if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
       if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
         atomicAdd(&toptr[parent], temp[thread_id]);

From 1cf8e0d9e12fdd742c978be828716b8f6b1d2b26 Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Mon, 24 Jun 2024 12:14:10 +0200
Subject: [PATCH 32/33] test: generic_reducer_operation and block_boundary

---
 tests-cuda/test_3162_block_boundary.py        |  170 +++
 ...est_3162_cuda_generic_reducer_operation.py | 1203 +++--------------
 2 files changed, 357 insertions(+), 1016 deletions(-)
 create mode 100644 tests-cuda/test_3162_block_boundary.py

diff --git a/tests-cuda/test_3162_block_boundary.py b/tests-cuda/test_3162_block_boundary.py
new file mode 100644
index 0000000000..594265c424
--- /dev/null
+++ b/tests-cuda/test_3162_block_boundary.py
@@ -0,0 +1,170 @@
+from __future__ import annotations
+
+import cupy as cp
+import numpy as np
+import pytest
+
+import awkward as ak
+
+to_list = ak.operations.to_list
+
+
+@pytest.fixture(scope="function", autouse=True)
+def cleanup_cuda():
+    yield  # everything below is teardown, run after each test in this module
+    cp.get_default_memory_pool().free_all_blocks()  # public accessor for the pool
+    cp.cuda.Device().synchronize()
+
+
+def test_block_boundary_sum():
+    """Compare ``ak.sum`` on the CUDA backend with the CPU result."""
+    np.random.seed(42)
+    flat_cpu = ak.contents.NumpyArray(np.random.randint(3000, size=3000))
+    flat_gpu = ak.to_backend(flat_cpu, "cuda", highlevel=False)
+    flat_ref = ak.sum(flat_cpu, -1, highlevel=False)
+    assert ak.sum(flat_gpu, -1, highlevel=False) == flat_ref
+
+    # Same values split into sublists of length 1, 2997 and 2.
+    bounds = ak.index.Index64(np.array([0, 1, 2998, 3000], dtype=np.int64))
+    nested_cpu = ak.contents.ListOffsetArray(bounds, flat_cpu)
+    nested_gpu = ak.to_backend(nested_cpu, "cuda", highlevel=False)
+    nested_ref = to_list(ak.sum(nested_cpu, -1, highlevel=False))
+    assert to_list(ak.sum(nested_gpu, -1, highlevel=False)) == nested_ref
+    del flat_gpu, nested_gpu
+
+
+def test_block_boundary_any():
+    """Compare ``ak.any`` on the CUDA backend with the CPU result."""
+    np.random.seed(42)
+    flat_cpu = ak.contents.NumpyArray(np.random.randint(3000, size=3000))
+    flat_gpu = ak.to_backend(flat_cpu, "cuda", highlevel=False)
+    flat_ref = ak.any(flat_cpu, -1, highlevel=False)
+    assert ak.any(flat_gpu, -1, highlevel=False) == flat_ref
+
+    # Same values split into sublists of length 1, 2997 and 2.
+    bounds = ak.index.Index64(np.array([0, 1, 2998, 3000], dtype=np.int64))
+    nested_cpu = ak.contents.ListOffsetArray(bounds, flat_cpu)
+    nested_gpu = ak.to_backend(nested_cpu, "cuda", highlevel=False)
+    nested_ref = to_list(ak.any(nested_cpu, -1, highlevel=False))
+    assert to_list(ak.any(nested_gpu, -1, highlevel=False)) == nested_ref
+    del flat_gpu, nested_gpu
+
+
+def test_block_boundary_all():
+    """Compare ``ak.all`` on the CUDA backend with the CPU result."""
+    np.random.seed(42)
+    flat_cpu = ak.contents.NumpyArray(np.random.randint(3000, size=3000))
+    flat_gpu = ak.to_backend(flat_cpu, "cuda", highlevel=False)
+    flat_ref = ak.all(flat_cpu, -1, highlevel=False)
+    assert ak.all(flat_gpu, -1, highlevel=False) == flat_ref
+
+    # Same values split into sublists of length 1, 2997 and 2.
+    bounds = ak.index.Index64(np.array([0, 1, 2998, 3000], dtype=np.int64))
+    nested_cpu = ak.contents.ListOffsetArray(bounds, flat_cpu)
+    nested_gpu = ak.to_backend(nested_cpu, "cuda", highlevel=False)
+    nested_ref = to_list(ak.all(nested_cpu, -1, highlevel=False))
+    assert to_list(ak.all(nested_gpu, -1, highlevel=False)) == nested_ref
+    del flat_gpu, nested_gpu
+
+
+def test_block_boundary_sum_bool():
+    """Compare ``ak.sum`` of boolean data on the CUDA backend with the CPU result."""
+    np.random.seed(42)
+    flat_cpu = ak.contents.NumpyArray(np.random.randint(2, size=3000, dtype=np.bool_))
+    flat_gpu = ak.to_backend(flat_cpu, "cuda", highlevel=False)
+    flat_ref = ak.sum(flat_cpu, -1, highlevel=False)
+    assert ak.sum(flat_gpu, -1, highlevel=False) == flat_ref
+    # dtype=np.bool_ above: without it randint yields integers, not booleans.
+    # Same values split into sublists of length 1, 2997 and 2.
+    bounds = ak.index.Index64(np.array([0, 1, 2998, 3000], dtype=np.int64))
+    nested_cpu = ak.contents.ListOffsetArray(bounds, flat_cpu)
+    nested_gpu = ak.to_backend(nested_cpu, "cuda", highlevel=False)
+    nested_ref = to_list(ak.sum(nested_cpu, -1, highlevel=False))
+    assert to_list(ak.sum(nested_gpu, -1, highlevel=False)) == nested_ref
+    del flat_gpu, nested_gpu
+
+
+def test_block_boundary_max():
+    """Compare ``ak.max`` on the CUDA backend with the CPU result."""
+    np.random.seed(42)
+    flat_cpu = ak.contents.NumpyArray(np.random.randint(3000, size=3000))
+    flat_gpu = ak.to_backend(flat_cpu, "cuda", highlevel=False)
+    flat_ref = ak.max(flat_cpu, -1, highlevel=False)
+    assert ak.max(flat_gpu, -1, highlevel=False) == flat_ref
+
+    # Same values split into sublists of length 1, 2997 and 2.
+    bounds = ak.index.Index64(np.array([0, 1, 2998, 3000], dtype=np.int64))
+    nested_cpu = ak.contents.ListOffsetArray(bounds, flat_cpu)
+    nested_gpu = ak.to_backend(nested_cpu, "cuda", highlevel=False)
+    nested_ref = to_list(ak.max(nested_cpu, -1, highlevel=False))
+    assert to_list(ak.max(nested_gpu, -1, highlevel=False)) == nested_ref
+    del flat_gpu, nested_gpu
+
+
+def test_block_boundary_min():
+    """Compare ``ak.min`` on the CUDA backend with the CPU result."""
+    np.random.seed(42)
+    flat_cpu = ak.contents.NumpyArray(np.random.randint(3000, size=3000))
+    flat_gpu = ak.to_backend(flat_cpu, "cuda", highlevel=False)
+    flat_ref = ak.min(flat_cpu, -1, highlevel=False)
+    assert ak.min(flat_gpu, -1, highlevel=False) == flat_ref
+
+    # Same values split into sublists of length 1, 2997 and 2.
+    bounds = ak.index.Index64(np.array([0, 1, 2998, 3000], dtype=np.int64))
+    nested_cpu = ak.contents.ListOffsetArray(bounds, flat_cpu)
+    nested_gpu = ak.to_backend(nested_cpu, "cuda", highlevel=False)
+    nested_ref = to_list(ak.min(nested_cpu, -1, highlevel=False))
+    assert to_list(ak.min(nested_gpu, -1, highlevel=False)) == nested_ref
+    del flat_gpu, nested_gpu
+
+
+def test_block_boundary_count():
+    """Compare ``ak.count`` on the CUDA backend with the CPU result."""
+    np.random.seed(42)
+    flat_cpu = ak.contents.NumpyArray(np.random.randint(3000, size=3000))
+    flat_gpu = ak.to_backend(flat_cpu, "cuda", highlevel=False)
+    flat_ref = ak.count(flat_cpu, -1, highlevel=False)
+    assert ak.count(flat_gpu, -1, highlevel=False) == flat_ref
+
+    # Same values split into sublists of length 1, 2997 and 2.
+    bounds = ak.index.Index64(np.array([0, 1, 2998, 3000], dtype=np.int64))
+    nested_cpu = ak.contents.ListOffsetArray(bounds, flat_cpu)
+    nested_gpu = ak.to_backend(nested_cpu, "cuda", highlevel=False)
+    nested_ref = to_list(ak.count(nested_cpu, -1, highlevel=False))
+    assert to_list(ak.count(nested_gpu, -1, highlevel=False)) == nested_ref
+    del flat_gpu, nested_gpu
+
+
+def test_block_boundary_count_nonzero():
+    """Compare ``ak.count_nonzero`` on the CUDA backend with the CPU result."""
+    np.random.seed(42)
+    flat_cpu = ak.contents.NumpyArray(np.random.randint(2, size=3000))
+    flat_gpu = ak.to_backend(flat_cpu, "cuda", highlevel=False)
+    flat_ref = ak.count_nonzero(flat_cpu, -1, highlevel=False)
+    assert ak.count_nonzero(flat_gpu, -1, highlevel=False) == flat_ref
+
+    # Same values split into sublists of length 1, 2997 and 2.
+    bounds = ak.index.Index64(np.array([0, 1, 2998, 3000], dtype=np.int64))
+    nested_cpu = ak.contents.ListOffsetArray(bounds, flat_cpu)
+    nested_gpu = ak.to_backend(nested_cpu, "cuda", highlevel=False)
+    nested_ref = to_list(ak.count_nonzero(nested_cpu, -1, highlevel=False))
+    assert to_list(ak.count_nonzero(nested_gpu, -1, highlevel=False)) == nested_ref
+    del flat_gpu, nested_gpu
+
+
+def test_block_boundary_prod():
+    """Compare ``ak.prod`` on the CUDA backend with the CPU result."""
+    primes = [  # a composite x always has a divisor no larger than sqrt(x)
+        x for x in range(2, 30000) if all(x % n != 0 for n in range(2, int(x**0.5) + 1))
+    ]
+    flat_cpu = ak.contents.NumpyArray(primes)
+    flat_gpu = ak.to_backend(flat_cpu, "cuda", highlevel=False)
+    flat_ref = ak.prod(flat_cpu, -1, highlevel=False)
+    assert ak.prod(flat_gpu, -1, highlevel=False) == flat_ref
+    # Sublists of length 1, 2997 and 2 over the leading 3000 entries.
+    bounds = ak.index.Index64(np.array([0, 1, 2998, 3000], dtype=np.int64))
+    nested_cpu = ak.contents.ListOffsetArray(bounds, flat_cpu)
+    nested_gpu = ak.to_backend(nested_cpu, "cuda", highlevel=False)
+    nested_ref = to_list(ak.prod(nested_cpu, -1, highlevel=False))
+    assert to_list(ak.prod(nested_gpu, -1, highlevel=False)) == nested_ref
+    del flat_gpu, nested_gpu
diff --git a/tests-cuda/test_3162_cuda_generic_reducer_operation.py b/tests-cuda/test_3162_cuda_generic_reducer_operation.py
index cd843a1f56..0c00106cb4 100644
--- a/tests-cuda/test_3162_cuda_generic_reducer_operation.py
+++ b/tests-cuda/test_3162_cuda_generic_reducer_operation.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import cupy as cp
+import cupy.testing as cpt
 import numpy as np
 import pytest
 
@@ -16,8 +17,6 @@ def cleanup_cuda():
     cp.cuda.Device().synchronize()
 
 
-to_list = ak.operations.to_list
-
 primes = [x for x in range(2, 1000) if all(x % n != 0 for n in range(2, x))]
 
 
@@ -80,710 +79,73 @@ def test_0115_generic_reducer_operation_dimension_optiontype_2():
             [73 * 79 * 83 * 89 * 97],
             [53 * 59 * 61 * 67 * 71],
         ],
-        [[31 * 37 * 41 * 43 * 47], [13 * 17 * 19 * 23 * 29], [2 * 3 * 5 * 7 * 11]],
-    ]
-    del depth2
-
-
-def test_0115_generic_reducer_operation_reproduce_numpy_1():
-    content1 = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5], dtype=np.int64))
-    offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 30], dtype=np.int64))
-    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
-    depth2 = ak.contents.ListOffsetArray(
-        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
-    )
-    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
-
-    assert to_list(depth2) == [
-        [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]],
-        [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]],
-    ]
-
-    assert to_list(ak.prod(depth2, axis=-1, highlevel=False)) == [
-        [2 * 3 * 5 * 7 * 11, 13 * 17 * 19 * 23 * 29, 31 * 37 * 41 * 43 * 47],
-        [53 * 59 * 61 * 67 * 71, 73 * 79 * 83 * 89 * 97, 101 * 103 * 107 * 109 * 113],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), axis=-1, highlevel=False).form
-        == ak.prod(depth2, axis=-1, highlevel=False).form
-    )
-    assert to_list(ak.prod(depth2, axis=2, highlevel=False)) == [
-        [2 * 3 * 5 * 7 * 11, 13 * 17 * 19 * 23 * 29, 31 * 37 * 41 * 43 * 47],
-        [53 * 59 * 61 * 67 * 71, 73 * 79 * 83 * 89 * 97, 101 * 103 * 107 * 109 * 113],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), axis=2, highlevel=False).form
-        == ak.prod(depth2, axis=2, highlevel=False).form
-    )
-
-    assert to_list(ak.prod(depth2, axis=-2, highlevel=False)) == [
-        [2 * 13 * 31, 3 * 17 * 37, 5 * 19 * 41, 7 * 23 * 43, 11 * 29 * 47],
-        [53 * 73 * 101, 59 * 79 * 103, 61 * 83 * 107, 67 * 89 * 109, 71 * 97 * 113],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), axis=-2, highlevel=False).form
-        == ak.prod(depth2, axis=-2, highlevel=False).form
-    )
-    assert to_list(ak.prod(depth2, axis=1, highlevel=False)) == [
-        [2 * 13 * 31, 3 * 17 * 37, 5 * 19 * 41, 7 * 23 * 43, 11 * 29 * 47],
-        [53 * 73 * 101, 59 * 79 * 103, 61 * 83 * 107, 67 * 89 * 109, 71 * 97 * 113],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), axis=1, highlevel=False).form
-        == ak.prod(depth2, axis=1, highlevel=False).form
-    )
-
-    assert to_list(ak.prod(depth2, axis=-3, highlevel=False)) == [
-        [2 * 53, 3 * 59, 5 * 61, 7 * 67, 11 * 71],
-        [13 * 73, 17 * 79, 19 * 83, 23 * 89, 29 * 97],
-        [31 * 101, 37 * 103, 41 * 107, 43 * 109, 47 * 113],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), axis=-3, highlevel=False).form
-        == ak.prod(depth2, axis=-3, highlevel=False).form
-    )
-    assert to_list(ak.prod(depth2, axis=0, highlevel=False)) == [
-        [2 * 53, 3 * 59, 5 * 61, 7 * 67, 11 * 71],
-        [13 * 73, 17 * 79, 19 * 83, 23 * 89, 29 * 97],
-        [31 * 101, 37 * 103, 41 * 107, 43 * 109, 47 * 113],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), axis=0, highlevel=False).form
-        == ak.prod(depth2, axis=0, highlevel=False).form
-    )
-    del depth2
-
-
-def test_0115_generic_reducer_operation_reproduce_numpy_2():
-    content2 = ak.contents.NumpyArray(np.array(primes[:12], dtype=np.int64))
-    offsets3 = ak.index.Index64(np.array([0, 4, 8, 12], dtype=np.int64))
-    depth1 = ak.contents.ListOffsetArray(offsets3, content2)
-    depth1 = ak.to_backend(depth1, "cuda", highlevel=False)
-
-    assert to_list(ak.prod(depth1, -1, highlevel=False)) == [
-        2 * 3 * 5 * 7,
-        11 * 13 * 17 * 19,
-        23 * 29 * 31 * 37,
-    ]
-    assert (
-        ak.prod(depth1.to_typetracer(), -1, highlevel=False).form
-        == ak.prod(depth1, -1, highlevel=False).form
-    )
-    assert to_list(ak.prod(depth1, 1, highlevel=False)) == [
-        2 * 3 * 5 * 7,
-        11 * 13 * 17 * 19,
-        23 * 29 * 31 * 37,
-    ]
-    assert (
-        ak.prod(depth1.to_typetracer(), 1, highlevel=False).form
-        == ak.prod(depth1, 1, highlevel=False).form
-    )
-
-    assert to_list(ak.prod(depth1, -2, highlevel=False)) == [
-        2 * 11 * 23,
-        3 * 13 * 29,
-        5 * 17 * 31,
-        7 * 19 * 37,
-    ]
-    assert (
-        ak.prod(depth1.to_typetracer(), -2, highlevel=False).form
-        == ak.prod(depth1, -2, highlevel=False).form
-    )
-    assert to_list(ak.prod(depth1, 0, highlevel=False)) == [
-        2 * 11 * 23,
-        3 * 13 * 29,
-        5 * 17 * 31,
-        7 * 19 * 37,
-    ]
-    assert (
-        ak.prod(depth1.to_typetracer(), 0, highlevel=False).form
-        == ak.prod(depth1, 0, highlevel=False).form
-    )
-    del depth1
-
-
-def test_0115_generic_reducer_operation_gaps_1():
-    content1 = ak.contents.NumpyArray(
-        np.array([123] + primes[: 2 * 3 * 5], dtype=np.int64)
-    )
-    offsets1 = ak.index.Index64(np.array([0, 1, 6, 11, 16, 21, 26, 31], dtype=np.int64))
-    offsets2 = ak.index.Index64(np.array([1, 4, 7], dtype=np.int64))
-    depth2 = ak.contents.ListOffsetArray(
-        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
-    )
-    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
-
-    assert to_list(depth2) == [
-        [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]],
-        [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]],
-    ]
-
-    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
-        [106, 177, 305, 469, 781],
-        [949, 1343, 1577, 2047, 2813],
-        [3131, 3811, 4387, 4687, 5311],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
-        == ak.prod(depth2, -3, highlevel=False).form
-    )
-    del depth2
-
-
-def test_0115_generic_reducer_operation_gaps_2():
-    content1 = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5 - 1], dtype=np.int64))
-    offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 29], dtype=np.int64))
-    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
-    depth2 = ak.contents.ListOffsetArray(
-        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
-    )
-    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
-
-    assert to_list(depth2) == [
-        [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]],
-        [
-            [53, 59, 61, 67, 71],
-            [73, 79, 83, 89, 97],
-            [
-                101,
-                103,
-                107,
-                109,
-            ],
-        ],
-    ]
-
-    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
-        [106, 177, 305, 469, 781],
-        [949, 1343, 1577, 2047, 2813],
-        [3131, 3811, 4387, 4687, 47],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
-        == ak.prod(depth2, -3, highlevel=False).form
-    )
-    del depth2
-
-
-def test_0115_generic_reducer_operation_gaps_3():
-    content1 = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5 - 2], dtype=np.int64))
-    offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 28], dtype=np.int64))
-    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
-    depth2 = ak.contents.ListOffsetArray(
-        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
-    )
-    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
-
-    assert to_list(depth2) == [
-        [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]],
-        [
-            [53, 59, 61, 67, 71],
-            [73, 79, 83, 89, 97],
-            [
-                101,
-                103,
-                107,
-            ],
-        ],
-    ]
-
-    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
-        [106, 177, 305, 469, 781],
-        [949, 1343, 1577, 2047, 2813],
-        [3131, 3811, 4387, 43, 47],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
-        == ak.prod(depth2, -3, highlevel=False).form
-    )
-    del depth2
-
-
-def test_0115_generic_reducer_operation_gaps_4():
-    content1 = ak.contents.NumpyArray(
-        np.array(
-            [
-                2,
-                3,
-                5,
-                7,
-                11,
-                13,
-                17,
-                19,
-                23,
-                29,
-                31,
-                37,
-                41,
-                43,
-                47,
-                53,
-                59,
-                61,
-                67,
-                71,
-                73,
-                79,
-                83,
-                89,
-                101,
-                103,
-                107,
-                109,
-            ],
-            dtype=np.int64,
-        )
-    )
-    offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 24, 28], dtype=np.int64))
-    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
-    depth2 = ak.contents.ListOffsetArray(
-        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
-    )
-    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
-
-    assert to_list(depth2) == [
-        [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]],
-        [
-            [53, 59, 61, 67, 71],
-            [
-                73,
-                79,
-                83,
-                89,
-            ],
-            [101, 103, 107, 109],
-        ],
-    ]
-
-    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
-        [106, 177, 305, 469, 781],
-        [949, 1343, 1577, 2047, 29],
-        [3131, 3811, 4387, 4687, 47],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
-        == ak.prod(depth2, -3, highlevel=False).form
-    )
-    del depth2
-
-
-def test_0115_generic_reducer_operation_gaps_5():
-    content1 = ak.contents.NumpyArray(np.array(primes[1 : 2 * 3 * 5], dtype=np.int64))
-    offsets1 = ak.index.Index64(np.array([0, 4, 9, 14, 19, 24, 29], dtype=np.int64))
-    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
-    depth2 = ak.contents.ListOffsetArray(
-        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
-    )
-    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
-
-    assert to_list(depth2) == [
-        [[3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]],
-        [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]],
-    ]
-
-    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
-        [159, 295, 427, 737, 71],
-        [949, 1343, 1577, 2047, 2813],
-        [3131, 3811, 4387, 4687, 5311],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
-        == ak.prod(depth2, -3, highlevel=False).form
-    )
-    del depth2
-
-
-def test_0115_generic_reducer_operation_gaps_6():
-    content1 = ak.contents.NumpyArray(np.array(primes[2 : 2 * 3 * 5], dtype=np.int64))
-    offsets1 = ak.index.Index64(np.array([0, 3, 8, 13, 18, 23, 28], dtype=np.int64))
-    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
-    depth2 = ak.contents.ListOffsetArray(
-        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
-    )
-    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
-
-    assert to_list(depth2) == [
-        [[5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]],
-        [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]],
-    ]
-
-    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
-        [265, 413, 671, 67, 71],
-        [949, 1343, 1577, 2047, 2813],
-        [3131, 3811, 4387, 4687, 5311],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
-        == ak.prod(depth2, -3, highlevel=False).form
-    )
-    del depth2
-
-
-def test_0115_generic_reducer_operation_gaps_7():
-    content1 = ak.contents.NumpyArray(
-        np.array(
-            [
-                3,
-                5,
-                7,
-                13,
-                17,
-                19,
-                23,
-                29,
-                31,
-                37,
-                41,
-                43,
-                47,
-                53,
-                59,
-                61,
-                67,
-                71,
-                73,
-                79,
-                83,
-                89,
-                97,
-                101,
-                103,
-                107,
-                109,
-                113,
-            ],
-            dtype=np.int64,
-        )
-    )
-    offsets1 = ak.index.Index64(np.array([0, 3, 8, 13, 18, 23, 28], dtype=np.int64))
-    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
-    depth2 = ak.contents.ListOffsetArray(
-        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
-    )
-    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
-
-    assert to_list(depth2) == [
-        [
-            [
-                3,
-                5,
-                7,
-            ],
-            [13, 17, 19, 23, 29],
-            [31, 37, 41, 43, 47],
-        ],
-        [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]],
-    ]
-
-    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
-        [159, 295, 427, 67, 71],
-        [949, 1343, 1577, 2047, 2813],
-        [3131, 3811, 4387, 4687, 5311],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
-        == ak.prod(depth2, -3, highlevel=False).form
-    )
-    del depth2
-
-
-def test_0115_generic_reducer_operation_gaps_8():
-    content1 = ak.contents.NumpyArray(
-        np.array(
-            [
-                3,
-                5,
-                7,
-                11,
-                13,
-                17,
-                19,
-                23,
-                31,
-                37,
-                41,
-                43,
-                47,
-                53,
-                59,
-                61,
-                67,
-                71,
-                73,
-                79,
-                83,
-                89,
-                97,
-                101,
-                103,
-                107,
-                109,
-                113,
-            ],
-            dtype=np.int64,
-        )
-    )
-    offsets1 = ak.index.Index64(np.array([0, 4, 8, 13, 18, 23, 28], dtype=np.int64))
-    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
-    depth2 = ak.contents.ListOffsetArray(
-        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
-    )
-    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
-
-    assert to_list(depth2) == [
-        [[3, 5, 7, 11], [13, 17, 19, 23], [31, 37, 41, 43, 47]],
-        [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]],
-    ]
-
-    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
-        [159, 295, 427, 737, 71],
-        [949, 1343, 1577, 2047, 97],
-        [3131, 3811, 4387, 4687, 5311],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
-        == ak.prod(depth2, -3, highlevel=False).form
-    )
-    del depth2
-
-
-def test_0115_generic_reducer_operation_gaps_9():
-    content1 = ak.contents.NumpyArray(
-        np.array(
-            [
-                2,
-                3,
-                5,
-                7,
-                11,
-                13,
-                17,
-                19,
-                23,
-                29,
-                31,
-                37,
-                41,
-                43,
-                53,
-                59,
-                61,
-                67,
-                71,
-                73,
-                79,
-                83,
-                89,
-                97,
-                101,
-                103,
-                107,
-                109,
-            ],
-            dtype=np.int64,
-        )
-    )
-    offsets1 = ak.index.Index64(np.array([0, 5, 10, 14, 19, 24, 28], dtype=np.int64))
-    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
-    depth2 = ak.contents.ListOffsetArray(
-        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
-    )
-    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
-
-    assert to_list(depth2) == [
-        [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43]],
-        [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109]],
-    ]
-
-    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
-        [106, 177, 305, 469, 781],
-        [949, 1343, 1577, 2047, 2813],
-        [3131, 3811, 4387, 4687],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
-        == ak.prod(depth2, -3, highlevel=False).form
-    )
-    del depth2
-
-
-def test_0115_generic_reducer_operation_gaps_10():
-    content1 = ak.contents.NumpyArray(
-        np.array(
-            [
-                2,
-                3,
-                5,
-                7,
-                11,
-                13,
-                17,
-                19,
-                23,
-                31,
-                37,
-                41,
-                43,
-                47,
-                53,
-                59,
-                61,
-                67,
-                71,
-                73,
-                79,
-                83,
-                89,
-                101,
-                103,
-                107,
-                109,
-                113,
-            ],
-            dtype=np.int64,
-        )
-    )
-    offsets1 = ak.index.Index64(np.array([0, 5, 9, 14, 19, 23, 28], dtype=np.int64))
-    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
-    depth2 = ak.contents.ListOffsetArray(
-        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
-    )
-    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
-
-    assert to_list(depth2) == [
-        [[2, 3, 5, 7, 11], [13, 17, 19, 23], [31, 37, 41, 43, 47]],
-        [[53, 59, 61, 67, 71], [73, 79, 83, 89], [101, 103, 107, 109, 113]],
-    ]
-
-    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
-        [106, 177, 305, 469, 781],
-        [949, 1343, 1577, 2047],
-        [3131, 3811, 4387, 4687, 5311],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
-        == ak.prod(depth2, -3, highlevel=False).form
-    )
-    del depth2
-
-
-def test_0115_generic_reducer_operation_gaps_11():
-    content1 = ak.contents.NumpyArray(np.array(primes[:9], dtype=np.int64))
-    offsets1 = ak.index.Index64(np.array([0, 3, 4, 6, 6, 7, 9], dtype=np.int64))
-    offsets2 = ak.index.Index64(np.array([0, 2, 4, 6], dtype=np.int64))
-    depth2 = ak.contents.ListOffsetArray(
-        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
-    )
-    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
-
-    assert to_list(depth2) == [[[2, 3, 5], [7]], [[11, 13], []], [[17], [19, 23]]]
-
-    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
-        [2 * 11 * 17, 3 * 13, 5],
-        [7 * 19, 23],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
-        == ak.prod(depth2, -3, highlevel=False).form
-    )
-    del depth2
-
-
-def test_0115_generic_reducer_operation_gaps_12():
-    content1 = ak.contents.NumpyArray(np.array(primes[:9], dtype=np.int64))
-    offsets1 = ak.index.Index64(np.array([0, 3, 4, 6, 7, 9], dtype=np.int64))
-    offsets2 = ak.index.Index64(np.array([0, 2, 3, 5], dtype=np.int64))
-    depth2 = ak.contents.ListOffsetArray(
-        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
-    )
-    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
-
-    assert to_list(depth2) == [[[2, 3, 5], [7]], [[11, 13]], [[17], [19, 23]]]
-
-    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
-        [2 * 11 * 17, 3 * 13, 5],
-        [7 * 19, 23],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
-        == ak.prod(depth2, -3, highlevel=False).form
-    )
-    del depth2
-
-
-def test_0115_generic_reducer_operation_gaps_13():
-    content1 = ak.contents.NumpyArray(np.array(primes[:10], dtype=np.int64))
-    offsets1 = ak.index.Index64(np.array([0, 3, 5, 6, 8, 9, 10], dtype=np.int64))
-    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
-    depth2 = ak.contents.ListOffsetArray(
-        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
-    )
-    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
-
-    assert to_list(depth2) == [[[2, 3, 5], [7, 11], [13]], [[17, 19], [23], [29]]]
-
-    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
-        [34, 57, 5],
-        [161, 11],
-        [377],
+        [[31 * 37 * 41 * 43 * 47], [13 * 17 * 19 * 23 * 29], [2 * 3 * 5 * 7 * 11]],
     ]
-    assert (
-        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
-        == ak.prod(depth2, -3, highlevel=False).form
-    )
     del depth2
 
 
-def test_0115_generic_reducer_operation_gaps_14():
-    content1 = ak.contents.NumpyArray(np.array(primes[:9], dtype=np.int64))
-    offsets1 = ak.index.Index64(np.array([0, 3, 3, 5, 6, 8, 9], dtype=np.int64))
-    offsets2 = ak.index.Index64(np.array([0, 4, 6], dtype=np.int64))
+def test_0115_generic_reducer_operation_reproduce_numpy_1():
+    content1 = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5], dtype=np.int64))
+    offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 30], dtype=np.int64))
+    offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
     depth2 = ak.contents.ListOffsetArray(
         offsets2, ak.contents.ListOffsetArray(offsets1, content1)
     )
     depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
 
-    assert to_list(depth2) == [[[2, 3, 5], [], [7, 11], [13]], [[17, 19], [23]]]
+    assert to_list(depth2) == [
+        [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]],
+        [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]],
+    ]
 
-    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
-        [34, 57, 5],
-        [23],
-        [7, 11],
-        [13],
+    assert to_list(ak.prod(depth2, axis=-1, highlevel=False)) == [
+        [2 * 3 * 5 * 7 * 11, 13 * 17 * 19 * 23 * 29, 31 * 37 * 41 * 43 * 47],
+        [53 * 59 * 61 * 67 * 71, 73 * 79 * 83 * 89 * 97, 101 * 103 * 107 * 109 * 113],
     ]
     assert (
-        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
-        == ak.prod(depth2, -3, highlevel=False).form
+        ak.prod(depth2.to_typetracer(), axis=-1, highlevel=False).form
+        == ak.prod(depth2, axis=-1, highlevel=False).form
+    )
+    assert to_list(ak.prod(depth2, axis=2, highlevel=False)) == [
+        [2 * 3 * 5 * 7 * 11, 13 * 17 * 19 * 23 * 29, 31 * 37 * 41 * 43 * 47],
+        [53 * 59 * 61 * 67 * 71, 73 * 79 * 83 * 89 * 97, 101 * 103 * 107 * 109 * 113],
+    ]
+    assert (
+        ak.prod(depth2.to_typetracer(), axis=2, highlevel=False).form
+        == ak.prod(depth2, axis=2, highlevel=False).form
     )
     del depth2
 
 
-def test_0115_generic_reducer_operation_gaps_15():
-    content1 = ak.contents.NumpyArray(np.array(primes[:9], dtype=np.int64))
-    offsets1 = ak.index.Index64(np.array([0, 3, 3, 5, 6, 8, 9], dtype=np.int64))
-    offsets2 = ak.index.Index64(np.array([0, 4, 4, 6], dtype=np.int64))
-    depth2 = ak.contents.ListOffsetArray(
-        offsets2, ak.contents.ListOffsetArray(offsets1, content1)
-    )
-    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
-
-    assert to_list(depth2) == [[[2, 3, 5], [], [7, 11], [13]], [], [[17, 19], [23]]]
+def test_0115_generic_reducer_operation_reproduce_numpy_2():
+    content2 = ak.contents.NumpyArray(np.array(primes[:12], dtype=np.int64))
+    offsets3 = ak.index.Index64(np.array([0, 4, 8, 12], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(offsets3, content2)
+    depth1 = ak.to_backend(depth1, "cuda", highlevel=False)
 
-    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
-        [34, 57, 5],
-        [23],
-        [7, 11],
-        [13],
+    assert to_list(ak.prod(depth1, -1, highlevel=False)) == [
+        2 * 3 * 5 * 7,
+        11 * 13 * 17 * 19,
+        23 * 29 * 31 * 37,
     ]
     assert (
-        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
-        == ak.prod(depth2, -3, highlevel=False).form
+        ak.prod(depth1.to_typetracer(), -1, highlevel=False).form
+        == ak.prod(depth1, -1, highlevel=False).form
     )
-    del depth2
+    assert to_list(ak.prod(depth1, 1, highlevel=False)) == [
+        2 * 3 * 5 * 7,
+        11 * 13 * 17 * 19,
+        23 * 29 * 31 * 37,
+    ]
+    assert (
+        ak.prod(depth1.to_typetracer(), 1, highlevel=False).form
+        == ak.prod(depth1, 1, highlevel=False).form
+    )
+
+    del depth1
 
 
-def test_0115_generic_reducer_operation_gaps_16():
+def test_0115_generic_reducer_operation_gaps_1():
     content1 = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5], dtype=np.int64))
     offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 30], dtype=np.int64))
     offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
@@ -806,18 +168,10 @@ def test_0115_generic_reducer_operation_gaps_16():
         == ak.prod(depth2, -1, highlevel=False).form
     )
 
-    assert to_list(ak.prod(depth2, -2, highlevel=False)) == [
-        [2 * 13 * 31, 3 * 17 * 37, 5 * 19 * 41, 7 * 23 * 43, 11 * 29 * 47],
-        [53 * 73 * 101, 59 * 79 * 103, 61 * 83 * 107, 67 * 89 * 109, 71 * 97 * 113],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), -2, highlevel=False).form
-        == ak.prod(depth2, -2, highlevel=False).form
-    )
     del depth2
 
 
-def test_0115_generic_reducer_operation_gaps_17():
+def test_0115_generic_reducer_operation_gaps_2():
     content1 = ak.contents.NumpyArray(np.array(primes[:9], dtype=np.int64))
     offsets1 = ak.index.Index64(np.array([0, 3, 3, 5, 6, 8, 9], dtype=np.int64))
     offsets2 = ak.index.Index64(np.array([0, 4, 4, 6], dtype=np.int64))
@@ -842,26 +196,6 @@ def test_0115_generic_reducer_operation_gaps_17():
         == ak.prod(depth2, -1, highlevel=False).form
     )
 
-    assert to_list(ak.prod(depth2, -2, highlevel=False)) == [
-        [2 * 7 * 13, 3 * 11, 5],
-        [],
-        [17 * 23, 19],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), -2, highlevel=False).form
-        == ak.prod(depth2, -2, highlevel=False).form
-    )
-
-    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
-        [2 * 17, 3 * 19, 5],
-        [23],
-        [7, 11],
-        [13],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
-        == ak.prod(depth2, -3, highlevel=False).form
-    )
     del depth2
 
 
@@ -927,38 +261,6 @@ def test_0115_generic_reducer_operation_complicated():
         == ak.prod(complicated["y"], -1, highlevel=False).form
     )
 
-    with pytest.raises(TypeError):
-        to_list(ak.prod(complicated, -2, highlevel=False))
-
-    with pytest.raises(TypeError):
-        assert (
-            ak.prod(complicated.to_typetracer(), -2, highlevel=False).form
-            == ak.prod(complicated, -2, highlevel=False).form
-        )
-    assert to_list(ak.prod(complicated["x"], -2, highlevel=False)) == [
-        [2, 3, 5],
-        [],
-        [7, 11],
-    ]
-    assert (
-        ak.prod(complicated.to_typetracer()["x"], -2, highlevel=False).form
-        == ak.prod(complicated["x"], -2, highlevel=False).form
-    )
-    assert to_list(ak.prod(complicated["y"], -2, highlevel=False)) == [
-        [[182, 33, 5]],
-        [],
-        [[], [391, 19]],
-    ]
-    assert (
-        ak.prod(complicated.to_typetracer()["y"], -2, highlevel=False).form
-        == ak.prod(complicated["y"], -2, highlevel=False).form
-    )
-
-    assert to_list(complicated[0]) == [
-        {"x": [2, 3, 5], "y": [[2, 3, 5], [], [7, 11], [13]]}
-    ]
-    assert complicated.to_typetracer()[0].form == complicated[0].form
-
     with pytest.raises(TypeError):
         to_list(ak.prod(complicated[0], -1, highlevel=False))
 
@@ -970,8 +272,7 @@ def test_0115_generic_reducer_operation_complicated():
 def test_0115_generic_reducer_operation_EmptyArray():
     offsets = ak.index.Index64(np.array([0, 0, 0, 0], dtype=np.int64))
     array = ak.contents.ListOffsetArray(offsets, ak.contents.EmptyArray())
-    array = ak.to_backend(array, "cuda")
-
+    array = ak.to_backend(array, "cuda", highlevel=False)
     assert to_list(array) == [[], [], []]
 
     assert to_list(ak.prod(array, -1, highlevel=False)) == [1, 1, 1]
@@ -984,7 +285,7 @@ def test_0115_generic_reducer_operation_EmptyArray():
     array = ak.contents.ListOffsetArray(
         offsets, ak.contents.NumpyArray(np.array([], dtype=np.int64))
     )
-    array = ak.to_backend(array, "cuda")
+    array = ak.to_backend(array, "cuda", highlevel=False)
 
     assert to_list(array) == [[], [], []]
 
@@ -1004,7 +305,7 @@ def test_0115_generic_reducer_operation_IndexedOptionArray_1():
     indexedarray = ak.contents.IndexedArray(index, listoffsetarray)
     offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
     depth2 = ak.contents.ListOffsetArray(offsets2, indexedarray)
-    depth2 = ak.to_backend(depth2, "cuda")
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
 
     assert to_list(depth2) == [
         [[101, 103, 107, 109, 113], [73, 79, 83, 89, 97], [53, 59, 61, 67, 71]],
@@ -1020,24 +321,6 @@ def test_0115_generic_reducer_operation_IndexedOptionArray_1():
         == ak.prod(depth2, -1, highlevel=False).form
     )
 
-    assert to_list(ak.prod(depth2, -2, highlevel=False)) == [
-        [101 * 73 * 53, 103 * 79 * 59, 107 * 83 * 61, 109 * 89 * 67, 113 * 97 * 71],
-        [31 * 13 * 2, 37 * 17 * 3, 41 * 19 * 5, 43 * 23 * 7, 47 * 29 * 11],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), -2, highlevel=False).form
-        == ak.prod(depth2, -2, highlevel=False).form
-    )
-
-    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
-        [101 * 31, 103 * 37, 107 * 41, 109 * 43, 113 * 47],
-        [73 * 13, 79 * 17, 83 * 19, 89 * 23, 97 * 29],
-        [53 * 2, 59 * 3, 61 * 5, 67 * 7, 71 * 11],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
-        == ak.prod(depth2, -3, highlevel=False).form
-    )
     del depth2
 
 
@@ -1075,7 +358,7 @@ def test_0115_generic_reducer_operation_IndexedOptionArray_2():
     indexedoptionarray = ak.contents.IndexedOptionArray(index, listoffsetarray)
     offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
     depth2 = ak.contents.ListOffsetArray(offsets2, indexedoptionarray)
-    depth2 = ak.to_backend(depth2, "cuda")
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
 
     assert to_list(depth2) == [
         [[101, 103, 107, 109, 113], None, [53, 59, 61, 67, 71]],
@@ -1091,24 +374,6 @@ def test_0115_generic_reducer_operation_IndexedOptionArray_2():
         == ak.prod(depth2, -1, highlevel=False).form
     )
 
-    assert to_list(ak.prod(depth2, -2, highlevel=False)) == [
-        [101 * 53, 103 * 59, 107 * 61, 109 * 67, 113 * 71],
-        [31 * 2, 37 * 3, 41 * 5, 43 * 7, 47 * 11],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), -2, highlevel=False).form
-        == ak.prod(depth2, -2, highlevel=False).form
-    )
-
-    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
-        [101 * 31, 103 * 37, 107 * 41, 109 * 43, 113 * 47],
-        [],
-        [53 * 2, 59 * 3, 61 * 5, 67 * 7, 71 * 11],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
-        == ak.prod(depth2, -3, highlevel=False).form
-    )
     del depth2
 
 
@@ -1182,7 +447,7 @@ def test_0115_generic_reducer_operation_IndexedOptionArray_3():
     listoffsetarray = ak.contents.ListOffsetArray(offsets1, indexedoptionarray)
     offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
     depth2 = ak.contents.ListOffsetArray(offsets2, listoffsetarray)
-    depth2 = ak.to_backend(depth2, "cuda")
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
 
     assert to_list(depth2) == [
         [
@@ -1202,24 +467,6 @@ def test_0115_generic_reducer_operation_IndexedOptionArray_3():
         == ak.prod(depth2, -1, highlevel=False).form
     )
 
-    assert to_list(ak.prod(depth2, -2, highlevel=False)) == [
-        [101 * 53, 103 * 59, 107 * 61, 109 * 67, 113 * 71],
-        [31 * 2, 37 * 3, 41 * 5, 43 * 7, 47 * 11],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), -2, highlevel=False).form
-        == ak.prod(depth2, -2, highlevel=False).form
-    )
-
-    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
-        [101 * 31, 103 * 37, 107 * 41, 109 * 43, 113 * 47],
-        [1, 1, 1, 1, 1],
-        [53 * 2, 59 * 3, 61 * 5, 67 * 7, 71 * 11],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
-        == ak.prod(depth2, -3, highlevel=False).form
-    )
     del depth2
 
 
@@ -1285,7 +532,7 @@ def test_0115_generic_reducer_operation_IndexedOptionArray_4():
     listoffsetarray = ak.contents.ListOffsetArray(offsets1, indexedoptionarray)
     offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64))
     depth2 = ak.contents.ListOffsetArray(offsets2, listoffsetarray)
-    depth2 = ak.to_backend(depth2, "cuda")
+    depth2 = ak.to_backend(depth2, "cuda", highlevel=False)
 
     assert to_list(depth2) == [
         [[101, 103, 107, 109, 113], [None], [53, 59, 61, 67, 71]],
@@ -1301,24 +548,6 @@ def test_0115_generic_reducer_operation_IndexedOptionArray_4():
         == ak.prod(depth2, -1, highlevel=False).form
     )
 
-    assert to_list(ak.prod(depth2, -2, highlevel=False)) == [
-        [101 * 53, 103 * 59, 107 * 61, 109 * 67, 113 * 71],
-        [31 * 2, 37 * 3, 41 * 5, 43 * 7, 47 * 11],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), -2, highlevel=False).form
-        == ak.prod(depth2, -2, highlevel=False).form
-    )
-
-    assert to_list(ak.prod(depth2, -3, highlevel=False)) == [
-        [101 * 31, 103 * 37, 107 * 41, 109 * 43, 113 * 47],
-        [1],
-        [53 * 2, 59 * 3, 61 * 5, 67 * 7, 71 * 11],
-    ]
-    assert (
-        ak.prod(depth2.to_typetracer(), -3, highlevel=False).form
-        == ak.prod(depth2, -3, highlevel=False).form
-    )
     del depth2
 
 
@@ -1349,26 +578,6 @@ def test_0115_generic_reducer_operation_sum():
         == ak.sum(depth1, 1, highlevel=False).form
     )
 
-    assert to_list(ak.sum(depth1, -2, highlevel=False)) == [
-        1 + 16 + 256,
-        2 + 32 + 512,
-        4 + 64 + 1024,
-        8 + 128 + 2048,
-    ]
-    assert (
-        ak.sum(depth1.to_typetracer(), -2, highlevel=False).form
-        == ak.sum(depth1, -2, highlevel=False).form
-    )
-    assert to_list(ak.sum(depth1, 0, highlevel=False)) == [
-        1 + 16 + 256,
-        2 + 32 + 512,
-        4 + 64 + 1024,
-        8 + 128 + 2048,
-    ]
-    assert (
-        ak.sum(depth1.to_typetracer(), 0, highlevel=False).form
-        == ak.sum(depth1, 0, highlevel=False).form
-    )
     del depth1
 
 
@@ -1397,16 +606,6 @@ def test_0115_generic_reducer_operation_any():
         == ak.any(depth1, 1, highlevel=False).form
     )
 
-    assert to_list(ak.any(depth1, -2, highlevel=False)) == [True, True, True, False]
-    assert (
-        ak.any(depth1.to_typetracer(), -2, highlevel=False).form
-        == ak.any(depth1, -2, highlevel=False).form
-    )
-    assert to_list(ak.any(depth1, 0, highlevel=False)) == [True, True, True, False]
-    assert (
-        ak.any(depth1.to_typetracer(), 0, highlevel=False).form
-        == ak.any(depth1, 0, highlevel=False).form
-    )
     del depth1
 
 
@@ -1435,16 +634,6 @@ def test_0115_generic_reducer_operation_all():
         == ak.all(depth1, 1, highlevel=False).form
     )
 
-    assert to_list(ak.all(depth1, -2, highlevel=False)) == [False, True, False, True]
-    assert (
-        ak.all(depth1.to_typetracer(), -2, highlevel=False).form
-        == ak.all(depth1, -2, highlevel=False).form
-    )
-    assert to_list(ak.all(depth1, 0, highlevel=False)) == [False, True, False, True]
-    assert (
-        ak.all(depth1.to_typetracer(), 0, highlevel=False).form
-        == ak.all(depth1, 0, highlevel=False).form
-    )
     del depth1
 
 
@@ -1473,16 +662,6 @@ def test_0115_generic_reducer_operation_count():
         == ak.count(depth1, 1, highlevel=False).form
     )
 
-    assert to_list(ak.count(depth1, -2, highlevel=False)) == [3, 3, 3, 1]
-    assert (
-        ak.count(depth1.to_typetracer(), -2, highlevel=False).form
-        == ak.count(depth1, -2, highlevel=False).form
-    )
-    assert to_list(ak.count(depth1, 0, highlevel=False)) == [3, 3, 3, 1]
-    assert (
-        ak.count(depth1.to_typetracer(), 0, highlevel=False).form
-        == ak.count(depth1, 0, highlevel=False).form
-    )
     del depth1
 
 
@@ -1511,16 +690,6 @@ def test_0115_generic_reducer_operation_count_nonzero():
         == ak.count_nonzero(depth1, 1, highlevel=False).form
     )
 
-    assert to_list(ak.count_nonzero(depth1, -2, highlevel=False)) == [1, 3, 1, 1]
-    assert (
-        ak.count_nonzero(depth1.to_typetracer(), -2, highlevel=False).form
-        == ak.count_nonzero(depth1, -2, highlevel=False).form
-    )
-    assert to_list(ak.count_nonzero(depth1, 0, highlevel=False)) == [1, 3, 1, 1]
-    assert (
-        ak.count_nonzero(depth1.to_typetracer(), 0, highlevel=False).form
-        == ak.count_nonzero(depth1, 0, highlevel=False).form
-    )
     del depth1
 
 
@@ -1549,16 +718,6 @@ def test_0115_generic_reducer_operation_count_min_1():
         == ak.min(depth1, 1, highlevel=False).form
     )
 
-    assert to_list(ak.min(depth1, -2, highlevel=False)) == [0.0, 2.2, 0.0, 4.4]
-    assert (
-        ak.min(depth1.to_typetracer(), -2, highlevel=False).form
-        == ak.min(depth1, -2, highlevel=False).form
-    )
-    assert to_list(ak.min(depth1, 0, highlevel=False)) == [0.0, 2.2, 0.0, 4.4]
-    assert (
-        ak.min(depth1.to_typetracer(), 0, highlevel=False).form
-        == ak.min(depth1, 0, highlevel=False).form
-    )
     del depth1
 
 
@@ -1587,16 +746,6 @@ def test_0115_generic_reducer_operation_count_min_2():
         == ak.min(depth1, 1, highlevel=False).form
     )
 
-    assert to_list(ak.min(depth1, -2, highlevel=False)) == [False, True, False, True]
-    assert (
-        ak.min(depth1.to_typetracer(), -2, highlevel=False).form
-        == ak.min(depth1, -2, highlevel=False).form
-    )
-    assert to_list(ak.min(depth1, 0, highlevel=False)) == [False, True, False, True]
-    assert (
-        ak.min(depth1.to_typetracer(), 0, highlevel=False).form
-        == ak.min(depth1, 0, highlevel=False).form
-    )
     del depth1
 
 
@@ -1625,16 +774,6 @@ def test_0115_generic_reducer_operation_count_max_1():
         == ak.max(depth1, 1, highlevel=False).form
     )
 
-    assert to_list(ak.max(depth1, -2, highlevel=False)) == [1.1, 2.2, 3.3, 4.4]
-    assert (
-        ak.max(depth1.to_typetracer(), -2, highlevel=False).form
-        == ak.max(depth1, -2, highlevel=False).form
-    )
-    assert to_list(ak.max(depth1, 0, highlevel=False)) == [1.1, 2.2, 3.3, 4.4]
-    assert (
-        ak.max(depth1.to_typetracer(), 0, highlevel=False).form
-        == ak.max(depth1, 0, highlevel=False).form
-    )
     del depth1
 
 
@@ -1663,16 +802,6 @@ def test_0115_generic_reducer_operation_count_max_2():
         == ak.max(depth1, 1, highlevel=False).form
     )
 
-    assert to_list(ak.max(depth1, -2, highlevel=False)) == [False, True, True, False]
-    assert (
-        ak.max(depth1.to_typetracer(), -2, highlevel=False).form
-        == ak.max(depth1, -2, highlevel=False).form
-    )
-    assert to_list(ak.max(depth1, 0, highlevel=False)) == [False, True, True, False]
-    assert (
-        ak.max(depth1.to_typetracer(), 0, highlevel=False).form
-        == ak.max(depth1, 0, highlevel=False).form
-    )
     del depth1
 
 
@@ -1773,20 +902,6 @@ def test_0115_generic_reducer_operation_keepdims():
         ak.prod(depth2.to_typetracer(), axis=-1, keepdims=False, highlevel=False).form
         == ak.prod(depth2, axis=-1, keepdims=False, highlevel=False).form
     )
-    assert to_list(
-        ak.prod(depth2, axis=-2, keepdims=False, highlevel=False)
-    ) == to_list(ak.prod(nparray, axis=-2, keepdims=False, highlevel=False))
-    assert (
-        ak.prod(depth2.to_typetracer(), axis=-2, keepdims=False, highlevel=False).form
-        == ak.prod(depth2, axis=-2, keepdims=False, highlevel=False).form
-    )
-    assert to_list(
-        ak.prod(depth2, axis=-3, keepdims=False, highlevel=False)
-    ) == to_list(ak.prod(nparray, axis=-3, keepdims=False, highlevel=False))
-    assert (
-        ak.prod(depth2.to_typetracer(), axis=-3, keepdims=False, highlevel=False).form
-        == ak.prod(depth2, axis=-3, keepdims=False, highlevel=False).form
-    )
 
     assert to_list(ak.prod(depth2, axis=-1, keepdims=True, highlevel=False)) == to_list(
         ak.prod(nparray, axis=-1, keepdims=True, highlevel=False)
@@ -1795,20 +910,7 @@ def test_0115_generic_reducer_operation_keepdims():
         ak.prod(depth2.to_typetracer(), axis=-1, keepdims=True, highlevel=False).form
         == ak.prod(depth2, axis=-1, keepdims=True, highlevel=False).form
     )
-    assert to_list(ak.prod(depth2, axis=-2, keepdims=True, highlevel=False)) == to_list(
-        ak.prod(nparray, axis=-2, keepdims=True, highlevel=False)
-    )
-    assert (
-        ak.prod(depth2.to_typetracer(), axis=-2, keepdims=True, highlevel=False).form
-        == ak.prod(depth2, axis=-2, keepdims=True, highlevel=False).form
-    )
-    assert to_list(ak.prod(depth2, axis=-3, keepdims=True, highlevel=False)) == to_list(
-        ak.prod(nparray, axis=-3, keepdims=True, highlevel=False)
-    )
-    assert (
-        ak.prod(depth2.to_typetracer(), axis=-3, keepdims=True, highlevel=False).form
-        == ak.prod(depth2, axis=-3, keepdims=True, highlevel=False).form
-    )
+
     del depth2
 
 
@@ -1824,31 +926,11 @@ def test_0115_generic_reducer_operation_highlevel_1():
         [],
         [2, 1],
     ]
-    assert to_list(ak.operations.count(array, axis=2)) == [
-        [3, 0, 2, 1],
-        [],
-        [2, 1],
-    ]
     assert to_list(ak.operations.count(array, axis=-1, keepdims=True)) == [
         [[3], [0], [2], [1]],
         [],
         [[2], [1]],
     ]
-    assert to_list(ak.operations.count(array, axis=-2)) == [
-        [3, 2, 1],
-        [],
-        [2, 1],
-    ]
-    assert to_list(ak.operations.count(array, axis=1)) == [
-        [3, 2, 1],
-        [],
-        [2, 1],
-    ]
-    assert to_list(ak.operations.count(array, axis=-2, keepdims=True)) == [
-        [[3, 2, 1]],
-        [[]],
-        [[2, 1]],
-    ]
 
     assert ak.operations.count_nonzero(array) == 9
     assert to_list(ak.operations.count_nonzero(array, axis=-1)) == [
@@ -1856,11 +938,6 @@ def test_0115_generic_reducer_operation_highlevel_1():
         [],
         [2, 1],
     ]
-    assert to_list(ak.operations.count_nonzero(array, axis=-2)) == [
-        [3, 2, 1],
-        [],
-        [2, 1],
-    ]
 
     assert ak.operations.sum(array) == 2 + 3 + 5 + 7 + 11 + 13 + 17 + 19 + 23
     assert to_list(ak.operations.sum(array, axis=-1)) == [
@@ -1868,11 +945,6 @@ def test_0115_generic_reducer_operation_highlevel_1():
         [],
         [17 + 19, 23],
     ]
-    assert to_list(ak.operations.sum(array, axis=-2)) == [
-        [2 + 7 + 13, 3 + 11, 5],
-        [],
-        [17 + 23, 19],
-    ]
 
     assert ak.operations.prod(array) == 2 * 3 * 5 * 7 * 11 * 13 * 17 * 19 * 23
     assert to_list(ak.operations.prod(array, axis=-1)) == [
@@ -1880,23 +952,6 @@ def test_0115_generic_reducer_operation_highlevel_1():
         [],
         [17 * 19, 23],
     ]
-    assert to_list(ak.operations.prod(array, axis=-2)) == [
-        [2 * 7 * 13, 3 * 11, 5],
-        [],
-        [17 * 23, 19],
-    ]
-
-    assert ak.operations.min(array) == 2
-    assert to_list(ak.operations.min(array, axis=-1)) == [
-        [2, None, 7, 13],
-        [],
-        [17, 23],
-    ]
-    assert to_list(ak.operations.min(array, axis=-2)) == [
-        [2, 3, 5],
-        [],
-        [17, 19],
-    ]
 
     assert ak.operations.max(array) == 23
     assert to_list(ak.operations.max(array, axis=-1)) == [
@@ -1904,11 +959,7 @@ def test_0115_generic_reducer_operation_highlevel_1():
         [],
         [19, 23],
     ]
-    assert to_list(ak.operations.max(array, axis=-2)) == [
-        [13, 11, 5],
-        [],
-        [23, 19],
-    ]
+
     del array
 
 
@@ -1921,29 +972,149 @@ def test_0115_generic_reducer_operation_highlevel_2():
         ],
         check_valid=True,
     )
-    array = ak.to_backend(array, "cuda", highlevel=False)
-
-    assert ak.operations.any(array) is np.bool_(True)
+    array = ak.to_backend(array, "cuda")
+    assert ak.operations.any(array) == cp.bool_(True)
     assert to_list(ak.operations.any(array, axis=-1)) == [
         [True, False, False, True],
         [],
         [True, True],
     ]
-    assert to_list(ak.operations.any(array, axis=-2)) == [
-        [True, False, True],
-        [],
-        [True, True],
-    ]
 
-    assert ak.operations.all(array) is np.bool_(False)
+    assert ak.operations.all(array) == cp.bool_(False)
     assert to_list(ak.operations.all(array, axis=-1)) == [
         [False, True, False, True],
         [],
         [False, True],
     ]
-    assert to_list(ak.operations.all(array, axis=-2)) == [
-        [False, False, True],
+    del array
+
+
+def test_nonreducers():
+    x = ak.highlevel.Array([[1, 2, 3, 4, 5], [1, 2, 3, 4, 5]], check_valid=True)
+    y = ak.highlevel.Array(
+        [[1.1, 2.2, 2.9, 4.0, 5.1], [0.9, 2.1, 3.2, 4.1, 4.9]], check_valid=True
+    )
+    x = ak.to_backend(x, "cuda")
+    y = ak.to_backend(y, "cuda")
+
+    cpt.assert_allclose(ak.operations.mean(y), cp.mean(ak.operations.to_numpy(y)))
+    cpt.assert_allclose(ak.operations.var(y), cp.var(ak.operations.to_numpy(y)))
+    cpt.assert_allclose(
+        ak.operations.var(y, ddof=1), cp.var(ak.operations.to_numpy(y), ddof=1)
+    )
+    cpt.assert_allclose(ak.operations.std(y), np.std(ak.operations.to_numpy(y)))
+    cpt.assert_allclose(
+        ak.operations.std(y, ddof=1), cp.std(ak.operations.to_numpy(y), ddof=1)
+    )
+
+    cpt.assert_allclose(ak.operations.moment(y, 1), cp.mean(ak.operations.to_numpy(y)))
+    cpt.assert_allclose(
+        ak.operations.moment(y - ak.operations.mean(y), 2),
+        cp.var(ak.operations.to_numpy(y)),
+    )
+    cpt.assert_allclose(ak.operations.covar(y, y), cp.var(ak.operations.to_numpy(y)))
+    cpt.assert_allclose(ak.operations.corr(y, y), 1.0)
+
+    cpt.assert_allclose(ak.operations.corr(x, y), 0.9968772535047296)
+
+    cpt.assert_allclose(
+        to_list(ak.operations.mean(y, axis=-1)),
+        to_list(cp.mean(ak.operations.to_numpy(y), axis=-1)),
+    )
+    cpt.assert_allclose(
+        to_list(ak.operations.var(y, axis=-1)),
+        to_list(cp.var(ak.operations.to_numpy(y), axis=-1)),
+    )
+    cpt.assert_allclose(
+        to_list(ak.operations.var(y, axis=-1, ddof=1)),
+        to_list(cp.var(ak.operations.to_numpy(y), axis=-1, ddof=1)),
+    )
+    cpt.assert_allclose(
+        to_list(ak.operations.std(y, axis=-1)),
+        to_list(cp.std(ak.operations.to_numpy(y), axis=-1)),
+    )
+    cpt.assert_allclose(
+        to_list(ak.operations.std(y, axis=-1, ddof=1)),
+        to_list(cp.std(ak.operations.to_numpy(y), axis=-1, ddof=1)),
+    )
+
+    cpt.assert_allclose(
+        to_list(ak.operations.moment(y, 1, axis=-1)),
+        to_list(cp.mean(ak.operations.to_numpy(y), axis=-1)),
+    )
+    cpt.assert_allclose(
+        to_list(ak.operations.moment(y - ak.operations.mean(y, axis=-1), 2, axis=-1)),
+        to_list(cp.var(ak.operations.to_numpy(y), axis=-1)),
+    )
+    cpt.assert_allclose(
+        to_list(ak.operations.covar(y, y, axis=-1)),
+        to_list(cp.var(ak.operations.to_numpy(y), axis=-1)),
+    )
+    cpt.assert_allclose(to_list(ak.operations.corr(y, y, axis=-1)), [1.0, 1.0])
+
+    cpt.assert_allclose(
+        to_list(ak.operations.corr(x, y, axis=-1)),
+        [0.9975103695813371, 0.9964193240901015],
+    )
+
+
+def test_softmax():
+    array = ak.highlevel.Array(
+        [[np.log(2), np.log(2), np.log(4)], [], [np.log(5), np.log(5)]],
+        check_valid=True,
+    )
+    array = ak.to_backend(array, "cuda")
+
+    assert to_list(ak.operations.softmax(array, axis=-1)) == [
+        pytest.approx([0.25, 0.25, 0.5]),
         [],
-        [False, True],
+        pytest.approx([0.5, 0.5]),
     ]
     del array
+
+
+def test_prod_bool_1():
+    # this had been silently broken
+    array = np.array([[True, False, False], [True, False, False]])
+    content2 = ak.contents.NumpyArray(array.reshape(-1))
+    offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(offsets3, content2)
+    depth1 = ak.to_backend(depth1, "cuda")
+
+    assert to_list(ak.prod(depth1, axis=-1, highlevel=False)) == [0, 1, 0, 0]
+    assert to_list(ak.all(depth1, axis=-1, highlevel=False)) == [
+        False,
+        True,
+        False,
+        False,
+    ]
+    assert to_list(ak.min(depth1, axis=-1, highlevel=False)) == [
+        False,
+        None,
+        False,
+        False,
+    ]
+    del depth1
+
+
+def test_prod_bool_2():
+    array = np.array([[True, False, False], [True, False, False]]).view(np.uint8)
+    content2 = ak.contents.NumpyArray(array.reshape(-1))
+    offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(offsets3, content2)
+    depth1 = ak.to_backend(depth1, "cuda")
+
+    assert to_list(ak.prod(depth1, axis=-1, highlevel=False)) == [0, 1, 0, 0]
+    assert to_list(ak.all(depth1, axis=-1, highlevel=False)) == [0, 1, 0, 0]
+    assert to_list(ak.min(depth1, axis=-1, highlevel=False)) == [0, None, 0, 0]
+
+    array = np.array([[True, False, False], [True, False, False]]).astype(np.int32)
+    content2 = ak.contents.NumpyArray(array.reshape(-1))
+    offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64))
+    depth1 = ak.contents.ListOffsetArray(offsets3, content2)
+    depth1 = ak.to_backend(depth1, "cuda")
+
+    assert to_list(ak.prod(depth1, axis=-1, highlevel=False)) == [0, 1, 0, 0]
+    assert to_list(ak.all(depth1, axis=-1, highlevel=False)) == [0, 1, 0, 0]
+    assert to_list(ak.min(depth1, axis=-1, highlevel=False)) == [0, None, 0, 0]
+    del depth1

From 8df6bd4332b7184db38eaa4d9c34129dbc3c9e19 Mon Sep 17 00:00:00 2001
From: Manasvi Goyal <55101825+ManasviGoyal@users.noreply.github.com>
Date: Mon, 24 Jun 2024 14:27:47 +0200
Subject: [PATCH 33/33] Update dev/generate-tests.py

Co-authored-by: Ianna Osborne <ianna.osborne@cern.ch>
---
 dev/generate-tests.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dev/generate-tests.py b/dev/generate-tests.py
index 45068c590c..7c97628101 100644
--- a/dev/generate-tests.py
+++ b/dev/generate-tests.py
@@ -961,6 +961,7 @@ def gencudakerneltests(specdict):
                 f.write(
                     "import cupy\n"
                     "import cupy.testing as cpt\n"
+                    "import numpy as np\n"
                     "import pytest\n\n"
                     "import awkward as ak\n"
                     "import awkward._connect.cuda as ak_cu\n"