diff --git a/dev/generate-kernel-signatures.py b/dev/generate-kernel-signatures.py index b44155acab..6e342ae868 100644 --- a/dev/generate-kernel-signatures.py +++ b/dev/generate-kernel-signatures.py @@ -108,6 +108,7 @@ "awkward_ListOffsetArray_drop_none_indexes", "awkward_ListOffsetArray_reduce_local_nextparents_64", "awkward_ListOffsetArray_reduce_nonlocal_maxcount_offsetscopy_64", + "awkward_ListOffsetArray_reduce_local_outoffsets_64", "awkward_UnionArray_flatten_length", "awkward_UnionArray_flatten_combine", "awkward_UnionArray_nestedfill_tags_index", @@ -123,6 +124,7 @@ "awkward_reduce_sum_int32_bool_64", "awkward_reduce_sum_int64_bool_64", "awkward_reduce_sum_bool", + "awkward_reduce_prod", "awkward_reduce_prod_bool", "awkward_reduce_countnonzero", "awkward_sorting_ranges", @@ -381,6 +383,8 @@ def kernel_signatures_cuda_py(specification): from awkward._connect.cuda import fetch_specialization from awkward._connect.cuda import import_cupy +import math + cupy = import_cupy("Awkward Arrays with CUDA") """ ) diff --git a/dev/generate-tests.py b/dev/generate-tests.py index 12c65708ec..7c97628101 100644 --- a/dev/generate-tests.py +++ b/dev/generate-tests.py @@ -424,7 +424,6 @@ def genspectests(specdict): """ ) - f.write("import pytest\nimport kernels\n\n") f.write("import pytest\nimport numpy as np\nimport kernels\n\n") num = 1 if spec.tests == []: @@ -894,6 +893,7 @@ def gencpuunittests(specdict): "awkward_ListOffsetArray_drop_none_indexes", "awkward_ListOffsetArray_reduce_local_nextparents_64", "awkward_ListOffsetArray_reduce_nonlocal_maxcount_offsetscopy_64", + "awkward_ListOffsetArray_reduce_local_outoffsets_64", "awkward_UnionArray_flatten_length", "awkward_UnionArray_flatten_combine", "awkward_UnionArray_nestedfill_tags_index", @@ -909,6 +909,7 @@ def gencpuunittests(specdict): "awkward_reduce_sum_int32_bool_64", "awkward_reduce_sum_int64_bool_64", "awkward_reduce_sum_bool", + "awkward_reduce_prod", "awkward_reduce_prod_bool", "awkward_reduce_countnonzero", 
"awkward_sorting_ranges", @@ -959,6 +960,8 @@ def gencudakerneltests(specdict): f.write( "import cupy\n" + "import cupy.testing as cpt\n" + "import numpy as np\n" "import pytest\n\n" "import awkward as ak\n" "import awkward._connect.cuda as ak_cu\n" @@ -1028,7 +1031,7 @@ def gencudakerneltests(specdict): if isinstance(val, list): f.write( " " * 4 - + f"assert cupy.array_equal({arg}[:len(pytest_{arg})], cupy.array(pytest_{arg}))\n" + + f"cpt.assert_allclose({arg}[:len(pytest_{arg})], cupy.array(pytest_{arg}))\n" ) else: f.write(" " * 4 + f"assert {arg} == pytest_{arg}\n") @@ -1088,6 +1091,7 @@ def gencudaunittests(specdict): f.write( "import re\n" "import cupy\n" + "import cupy.testing as cpt\n" "import pytest\n\n" "import awkward as ak\n" "import awkward._connect.cuda as ak_cu\n" @@ -1224,7 +1228,7 @@ def gencudaunittests(specdict): if isinstance(val, list): f.write( " " * 4 - + f"assert cupy.array_equal({arg}[:len(pytest_{arg})], cupy.array(pytest_{arg}))\n" + + f"cpt.assert_allclose({arg}[:len(pytest_{arg})], cupy.array(pytest_{arg}))\n" ) else: f.write(" " * 4 + f"assert {arg} == pytest_{arg}\n") diff --git a/kernel-test-data.json b/kernel-test-data.json index c9d778d692..b13f5fcc29 100644 --- a/kernel-test-data.json +++ b/kernel-test-data.json @@ -19046,7 +19046,7 @@ }, { "name": "awkward_ListOffsetArray_reduce_local_outoffsets_64", - "status": false, + "status": true, "tests": [ { "error": false, @@ -23128,7 +23128,7 @@ }, { "name": "awkward_reduce_sum_bool", - "status": false, + "status": true, "tests": [ { "error": false, @@ -23251,7 +23251,7 @@ }, { "name": "awkward_reduce_prod_bool", - "status": false, + "status": true, "tests": [ { "error": false, @@ -23543,7 +23543,7 @@ }, { "name": "awkward_reduce_argmax", - "status": false, + "status": true, "tests": [ { "error": false, @@ -23705,7 +23705,7 @@ }, { "name": "awkward_reduce_max", - "status": false, + "status": true, "tests": [ { "error": false, @@ -23893,7 +23893,7 @@ }, { "name": 
"awkward_reduce_countnonzero", - "status": false, + "status": true, "tests": [ { "error": false, @@ -23990,7 +23990,7 @@ }, { "name": "awkward_reduce_count_64", - "status": false, + "status": true, "tests": [ { "error": false, @@ -24034,7 +24034,7 @@ "inputs": { "lenparents": 1696, "outlength": 331, - "parents": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 137, 137, 137, 137, 137, 137, 137, 137, 137, 137, 154, 154, 154, 154, 154, 154, 154, 154, 154, 154, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 70, 70, 70, 
70, 70, 70, 70, 70, 70, 70, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 138, 138, 138, 138, 138, 138, 138, 138, 138, 138, 155, 155, 155, 155, 155, 155, 155, 155, 155, 155, 172, 172, 172, 172, 172, 172, 172, 172, 172, 172, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 173, 173, 173, 173, 173, 173, 173, 173, 173, 173, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, 106, 106, 106, 106, 106, 106, 106, 106, 106, 106, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 157, 157, 157, 157, 157, 157, 157, 157, 157, 157, 174, 174, 174, 174, 174, 174, 174, 174, 174, 174, 191, 191, 191, 191, 191, 191, 191, 191, 191, 191, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 141, 141, 141, 141, 141, 141, 141, 141, 141, 141, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 159, 159, 159, 159, 159, 159, 159, 159, 159, 159, 176, 176, 176, 176, 176, 176, 176, 176, 176, 176, 193, 193, 193, 193, 193, 193, 193, 193, 193, 193, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 143, 143, 143, 143, 143, 
143, 143, 143, 143, 143, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 177, 177, 177, 177, 177, 177, 177, 177, 177, 177, 194, 194, 194, 194, 194, 194, 194, 194, 194, 194, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16] + "parents": [194, 194, 194, 194, 194, 194, 194, 194, 194, 194, 193, 193, 193, 193, 193, 193, 193, 193, 193, 193, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 191, 191, 191, 191, 191, 191, 191, 191, 191, 191, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 177, 177, 177, 177, 177, 177, 177, 177, 177, 177, 176, 176, 176, 176, 176, 176, 176, 176, 176, 176, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 174, 174, 174, 174, 174, 174, 174, 174, 174, 174, 173, 173, 173, 173, 173, 173, 173, 173, 173, 173, 172, 172, 172, 172, 172, 172, 172, 172, 172, 172, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 159, 159, 159, 159, 159, 159, 159, 159, 159, 159, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 157, 157, 157, 157, 157, 157, 157, 157, 157, 157, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 155, 155, 155, 155, 155, 155, 155, 155, 155, 155, 154, 154, 154, 154, 
154, 154, 154, 154, 154, 154, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 143, 143, 143, 143, 143, 143, 143, 143, 143, 143, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 141, 141, 141, 141, 141, 141, 141, 141, 141, 141, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 138, 138, 138, 138, 138, 138, 138, 138, 138, 138, 137, 137, 137, 137, 137, 137, 137, 137, 137, 137, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 106, 106, 106, 106, 106, 106, 106, 106, 106, 106, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 
56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] }, "outputs": { "toptr": [626, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] @@ -24368,7 +24368,7 @@ }, { "name": "awkward_reduce_sum", - "status": false, + "status": true, "tests": [ { "error": false, @@ -24763,22 +24763,9 @@ ] }, { - "name": "awkward_reduce_prod", - "status": false, + "name": "awkward_reduce_sum_int32_bool_64", + "status": true, "tests": [ - { - "error": false, - "message": "", - "inputs": { - "fromptr": [1, 0, 0, 1, 0, 0], - "lenparents": 6, - "outlength": 4, - "parents": [0, 0, 0, 2, 2, 3] - }, - "outputs": { - "toptr": [0, 1, 0, 0] - } - }, { "error": false, "message": "", @@ -24796,208 +24783,195 @@ "error": false, "message": "", "inputs": { - "fromptr": [0, 1, 2, 3, 4, 5], - "lenparents": 6, - "outlength": 4, - "parents": [0, 0, 0, 2, 2, 3] + "fromptr": [0], + "lenparents": 1, + "outlength": 1, + "parents": [0] }, "outputs": { - "toptr": [0, 1, 12, 5] + "toptr": [0] } }, { "error": false, "message": "", "inputs": { - "fromptr": [2, 53, 31, 101, 3, 59, 37, 103, 5, 61, 41, 107, 7, 67, 43, 109, 11, 71, 47, 113], - "lenparents": 20, - "outlength": 15, - "parents": [0, 0, 10, 10, 1, 1, 11, 11, 2, 2, 12, 12, 3, 3, 13, 13, 4, 4, 14, 14] + "fromptr": [0, 5, 20, 1, 6, 21, 2, 7, 22, 3, 8, 23, 4, 9, 24], + "lenparents": 15, + "outlength": 10, + "parents": [0, 5, 5, 1, 6, 6, 2, 7, 7, 3, 8, 8, 4, 9, 9] }, "outputs": { - "toptr": [106, 177, 305, 469, 781, 1, 1, 1, 1, 1, 3131, 3811, 4387, 4687, 5311] + "toptr": [0, 1, 1, 1, 1, 2, 2, 2, 2, 2] } }, { "error": false, "message": "", 
"inputs": { - "fromptr": [2, 53, 13, 73, 31, 101, 3, 59, 17, 79, 37, 103, 5, 61, 19, 83, 41, 107, 7, 67, 23, 89, 43, 109, 11, 71, 47, 113], - "lenparents": 28, - "outlength": 15, - "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 3, 8, 8, 13, 13, 4, 4, 14, 14] + "fromptr": [2, 3, 5, 7, 11, 13, 17, 19, 23], + "lenparents": 9, + "outlength": 6, + "parents": [0, 0, 0, 2, 2, 3, 4, 4, 5] }, "outputs": { - "toptr": [106, 177, 305, 469, 781, 949, 1343, 1577, 2047, 1, 3131, 3811, 4387, 4687, 5311] + "toptr": [3, 0, 2, 1, 2, 1] } }, { "error": false, "message": "", "inputs": { - "fromptr": [2, 53, 13, 73, 31, 101, 3, 59, 17, 79, 37, 103, 5, 61, 19, 83, 41, 107, 7, 67, 23, 89, 43, 11, 71, 29, 97, 47], - "lenparents": 28, - "outlength": 15, - "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 3, 8, 8, 13, 4, 4, 9, 9, 14] + "fromptr": [1, 0, 0, 1, 0, 0], + "lenparents": 6, + "outlength": 4, + "parents": [0, 0, 0, 2, 2, 3] }, "outputs": { - "toptr": [106, 177, 305, 469, 781, 949, 1343, 1577, 2047, 2813, 3131, 3811, 4387, 43, 47] + "toptr": [1, 0, 1, 0] } }, { "error": false, "message": "", "inputs": { - "fromptr": [2, 53, 13, 73, 31, 101, 3, 59, 17, 79, 37, 103, 5, 61, 19, 83, 41, 107, 7, 67, 23, 89, 43, 109, 11, 71, 29, 97], - "lenparents": 28, - "outlength": 14, - "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 3, 8, 8, 13, 13, 4, 4, 9, 9] + "fromptr": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24], + "lenparents": 15, + "outlength": 3, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2] }, "outputs": { - "toptr": [106, 177, 305, 469, 781, 949, 1343, 1577, 2047, 2813, 3131, 3811, 4387, 4687] + "toptr": [4, 5, 5] } }, { "error": false, "message": "", "inputs": { - "fromptr": [2, 53, 13, 73, 31, 101, 3, 59, 17, 79, 37, 103, 5, 61, 19, 83, 41, 107, 7, 67, 23, 89, 43, 109, 11, 71, 29, 97, 47], - "lenparents": 29, - "outlength": 15, - "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 
7, 12, 12, 3, 3, 8, 8, 13, 13, 4, 4, 9, 9, 14] + "fromptr": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], + "lenparents": 30, + "outlength": 6, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5] }, "outputs": { - "toptr": [106, 177, 305, 469, 781, 949, 1343, 1577, 2047, 2813, 3131, 3811, 4387, 4687, 47] + "toptr": [4, 5, 5, 5, 5, 5] } }, { "error": false, "message": "", "inputs": { - "fromptr": [2, 53, 13, 73, 31, 101, 3, 59, 17, 79, 37, 103, 5, 61, 19, 83, 41, 107, 7, 67, 23, 89, 43, 109, 11, 71, 29, 97, 47, 113], - "lenparents": 30, - "outlength": 15, - "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 3, 8, 8, 13, 13, 4, 4, 9, 9, 14, 14] + "fromptr": [0, 1, 3, 4, 5, 6], + "lenparents": 6, + "outlength": 4, + "parents": [0, 0, 1, 3, 3, 3] }, "outputs": { - "toptr": [106, 177, 305, 469, 781, 949, 1343, 1577, 2047, 2813, 3131, 3811, 4387, 4687, 5311] + "toptr": [1, 1, 0, 3] } }, { "error": false, "message": "", "inputs": { - "fromptr": [2, 53, 13, 73, 31, 101, 3, 59, 17, 79, 37, 103, 5, 61, 19, 83, 41, 107, 7, 67, 23, 89, 43, 109, 11, 71, 29, 47], - "lenparents": 28, - "outlength": 15, - "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 3, 8, 8, 13, 13, 4, 4, 9, 14] + "fromptr": [0, 5, 10, 15, 25, 1, 11, 16, 26, 2, 12, 17, 27, 8, 18, 28, 4, 9, 14, 29], + "lenparents": 20, + "outlength": 10, + "parents": [0, 0, 0, 5, 5, 1, 1, 6, 6, 2, 2, 7, 7, 3, 8, 8, 4, 4, 4, 9] }, "outputs": { - "toptr": [106, 177, 305, 469, 781, 949, 1343, 1577, 2047, 29, 3131, 3811, 4387, 4687, 47] + "toptr": [2, 2, 2, 1, 3, 2, 2, 2, 2, 1] } }, { "error": false, "message": "", "inputs": { - "fromptr": [0], - "lenparents": 1, - "outlength": 3, - "parents": [2] + "fromptr": [15, 20, 25, 16, 21, 26, 17, 22, 27, 18, 23, 28, 19, 24, 29], + "lenparents": 15, + "outlength": 15, + "parents": [0, 5, 10, 1, 6, 11, 2, 7, 12, 3, 8, 13, 4, 
9, 14] }, "outputs": { - "toptr": [1, 1, 0] + "toptr": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] } }, { "error": false, "message": "", "inputs": { - "fromptr": [101, 103, 107, 109, 113, 53, 59, 61, 67, 71, 31, 37, 41, 43, 47, 2, 3, 5, 7, 11], + "fromptr": [0, 15, 5, 10, 25, 1, 16, 11, 26, 2, 17, 12, 27, 18, 8, 28, 4, 9, 14, 29], "lenparents": 20, - "outlength": 6, - "parents": [0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5] + "outlength": 15, + "parents": [0, 0, 5, 10, 10, 1, 1, 11, 11, 2, 2, 12, 12, 3, 8, 13, 4, 9, 14, 14] }, "outputs": { - "toptr": [13710311357, 1, 907383479, 95041567, 1, 2310] + "toptr": [1, 2, 2, 1, 1, 1, 0, 0, 1, 1, 2, 2, 2, 1, 2] } }, { "error": false, "message": "", "inputs": { - "fromptr": [101, 103, 107, 109, 113, 73, 79, 83, 89, 97, 53, 59, 61, 67, 71, 31, 37, 41, 43, 47, 13, 17, 19, 23, 29, 2, 3, 5, 7, 11], + "fromptr": [0, 15, 5, 20, 10, 25, 1, 16, 6, 21, 11, 26, 2, 17, 7, 22, 12, 27, 3, 18, 8, 23, 13, 28, 4, 19, 9, 24, 14, 29], "lenparents": 30, - "outlength": 6, - "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5] + "outlength": 15, + "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 3, 8, 8, 13, 13, 4, 4, 9, 9, 14, 14] }, "outputs": { - "toptr": [13710311357, 4132280413, 907383479, 95041567, 2800733, 2310] + "toptr": [1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] } }, { "error": false, "message": "", "inputs": { - "fromptr": [101, 103, 107, 109, 113, 53, 59, 61, 67, 71, 31, 37, 41, 43, 47, 2, 3, 5, 7, 11], - "lenparents": 20, - "outlength": 4, - "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3] + "fromptr": [0, 5, 10, 15, 20, 25, 1, 6, 11, 16, 21, 26, 2, 7, 12, 17, 22, 27, 3, 8, 13, 18, 23, 28, 4, 9, 14, 19, 24, 29], + "lenparents": 30, + "outlength": 10, + "parents": [0, 0, 0, 5, 5, 5, 1, 1, 1, 6, 6, 6, 2, 2, 2, 7, 7, 7, 3, 3, 3, 8, 8, 8, 4, 4, 4, 9, 9, 9] }, "outputs": { - "toptr": [13710311357, 907383479, 95041567, 
2310] + "toptr": [2, 3, 3, 3, 3, 3, 3, 3, 3, 3] } }, { "error": false, "message": "", "inputs": { - "fromptr": [2, 7, 17, 29, 3, 11, 19, 31, 5, 13, 23, 37], + "fromptr": [1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0], "lenparents": 12, - "outlength": 6, - "parents": [0, 0, 3, 3, 1, 1, 4, 4, 2, 2, 5, 5] - }, - "outputs": { - "toptr": [14, 33, 65, 493, 589, 851] - } - }, - { - "error": false, - "message": "", - "inputs": { - "fromptr": [3, 53, 13, 73, 31, 101, 5, 59, 17, 79, 37, 103, 7, 61, 19, 83, 41, 107, 67, 23, 89, 43, 109, 71, 29, 97, 47, 113], - "lenparents": 28, - "outlength": 15, - "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 8, 8, 13, 13, 4, 9, 9, 14, 14] + "outlength": 3, + "parents": [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2] }, "outputs": { - "toptr": [159, 295, 427, 67, 71, 949, 1343, 1577, 2047, 2813, 3131, 3811, 4387, 4687, 5311] + "toptr": [4, 4, 0] } }, { "error": false, "message": "", "inputs": { - "fromptr": [3, 53, 13, 73, 31, 101, 5, 59, 17, 79, 37, 103, 7, 61, 19, 83, 41, 107, 11, 67, 23, 89, 43, 109, 71, 29, 97, 47, 113], - "lenparents": 29, - "outlength": 15, - "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 3, 8, 8, 13, 13, 4, 9, 9, 14, 14] + "fromptr": [1, 2, 3, 4, 5, 1, 2, 3, 4, 5], + "lenparents": 10, + "outlength": 2, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] }, "outputs": { - "toptr": [159, 295, 427, 737, 71, 949, 1343, 1577, 2047, 2813, 3131, 3811, 4387, 4687, 5311] + "toptr": [5, 5] } }, { "error": false, "message": "", "inputs": { - "fromptr": [3, 53, 13, 73, 31, 101, 5, 59, 17, 79, 37, 103, 7, 61, 19, 83, 41, 107, 11, 67, 23, 89, 43, 109, 71, 97, 47, 113], - "lenparents": 28, - "outlength": 15, - "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 3, 8, 8, 13, 13, 4, 9, 14, 14] + "fromptr": [1, 2, 3, 4, 5, 6], + "lenparents": 6, + "outlength": 1, + "parents": [0, 0, 0, 0, 0, 0] }, "outputs": { - "toptr": [159, 295, 427, 737, 71, 949, 1343, 1577, 2047, 97, 3131, 
3811, 4387, 4687, 5311] + "toptr": [6] } }, { @@ -25010,119 +24984,937 @@ "parents": [0, 0, 0, 6, 6, 1, 1, 7, 2] }, "outputs": { - "toptr": [182, 33, 5, 1, 1, 1, 391, 19] + "toptr": [3, 2, 1, 0, 0, 0, 2, 1] } }, { "error": false, "message": "", "inputs": { - "fromptr": [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37], + "fromptr": [1, 16, 0, 2, 32, 0, 4, 64, 0, 8, 128, 0], "lenparents": 12, - "outlength": 3, - "parents": [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2] + "outlength": 4, + "parents": [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3] }, "outputs": { - "toptr": [210, 46189, 765049] + "toptr": [2, 2, 2, 2] } }, { "error": false, "message": "", "inputs": { - "fromptr": [2, 3, 5, 7, 11, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 101, 103, 107, 109, 113], - "lenparents": 20, - "outlength": 6, - "parents": [0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5] + "fromptr": [0, 1, 2, 3, 4, 5], + "lenparents": 6, + "outlength": 4, + "parents": [0, 0, 0, 2, 2, 3] }, "outputs": { - "toptr": [2310, 1, 95041567, 907383479, 1, 13710311357] + "toptr": [2, 0, 2, 1] } }, { "error": false, "message": "", "inputs": { - "fromptr": [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113], - "lenparents": 30, - "outlength": 6, - "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5] + "fromptr": [0, 4, 1, 3, 5, 6], + "lenparents": 6, + "outlength": 4, + "parents": [0, 0, 1, 1, 1, 3] }, "outputs": { - "toptr": [2310, 2800733, 95041567, 907383479, 4132280413, 13710311357] + "toptr": [1, 3, 0, 1] } }, { "error": false, "message": "", "inputs": { - "fromptr": [2, 3, 5, 7, 11, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 101, 103, 107, 109, 113], - "lenparents": 20, - "outlength": 4, - "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3] + "fromptr": [1, 4, 9, 16, 25, 1, 4, 9, 16, 25], + "lenparents": 10, + "outlength": 2, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] }, 
"outputs": { - "toptr": [2310, 95041567, 907383479, 13710311357] + "toptr": [5, 5] } }, { "error": false, "message": "", "inputs": { - "fromptr": [2, 7, 3, 11, 5], - "lenparents": 5, - "outlength": 8, - "parents": [0, 6, 1, 7, 2] + "fromptr": [1, 4, 9, 16, 26, 1, 4, 10, 16, 24], + "lenparents": 10, + "outlength": 2, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] }, "outputs": { - "toptr": [2, 3, 5, 1, 1, 1, 7, 11] + "toptr": [5, 5] } }, { "error": false, "message": "", "inputs": { - "fromptr": [5, 53, 13, 73, 31, 101, 7, 59, 17, 79, 37, 103, 11, 61, 19, 83, 41, 107, 67, 23, 89, 43, 109, 71, 29, 97, 47, 113], - "lenparents": 28, - "outlength": 15, - "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 8, 8, 13, 13, 4, 9, 9, 14, 14] + "fromptr": [0, 5, 20, 1, 6, 21, 2, 7, 22, 3, 8, 23, 4, 9, 24], + "lenparents": 15, + "outlength": 10, + "parents": [0, 0, 5, 1, 1, 6, 2, 2, 7, 3, 3, 8, 4, 4, 9] }, "outputs": { - "toptr": [265, 413, 671, 67, 71, 949, 1343, 1577, 2047, 2813, 3131, 3811, 4387, 4687, 5311] + "toptr": [1, 2, 2, 2, 2, 1, 1, 1, 1, 1] } }, { "error": false, "message": "", "inputs": { - "fromptr": [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37], - "lenparents": 12, - "outlength": 8, - "parents": [0, 0, 0, 3, 3, 3, 4, 4, 4, 7, 7, 7] + "fromptr": [15, 20, 25, 16, 21, 26, 17, 22, 27, 18, 23, 28, 19, 24, 29], + "lenparents": 15, + "outlength": 5, + "parents": [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4] }, "outputs": { - "toptr": [30, 1, 1, 1001, 7429, 1, 1, 33263] + "toptr": [3, 3, 3, 3, 3] } }, { "error": false, "message": "", "inputs": { - "fromptr": [2, 3, 5, 7, 11, 13], - "lenparents": 6, - "outlength": 4, - "parents": [0, 0, 0, 2, 2, 3] + "fromptr": [1, 2, 3], + "lenparents": 3, + "outlength": 1, + "parents": [0, 0, 0] }, "outputs": { - "toptr": [30, 1, 77, 13] + "toptr": [3] } }, { "error": false, "message": "", "inputs": { - "fromptr": [2, 3, 5, 7, 11, 13, 17, 19, 23], - "lenparents": 9, + "fromptr": [0, 1, 2, 4, 5, 8, 9, 10, 11, 12, 14, 
15, 16, 17, 18, 25, 26, 27, 28, 29], + "lenparents": 20, + "outlength": 6, + "parents": [0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 5, 5, 5, 5, 5] + }, + "outputs": { + "toptr": [3, 3, 4, 4, 0, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 2, 4, 5, 5], + "lenparents": 5, + "outlength": 3, + "parents": [0, 0, 0, 2, 2] + }, + "outputs": { + "toptr": [3, 0, 2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], + "lenparents": 15, + "outlength": 3, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2] + }, + "outputs": { + "toptr": [5, 5, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [4, 1, 0, 1, 4, 5, 1, 0, 1, 3], + "lenparents": 10, + "outlength": 2, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + }, + "outputs": { + "toptr": [4, 4] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [4, 1, 0, 1, 4, 4, 1, 0, 1, 4], + "lenparents": 10, + "outlength": 2, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + }, + "outputs": { + "toptr": [4, 4] + } + } + ] + }, + { + "name": "awkward_reduce_sum_int64_bool_64", + "status": true, + "tests": [ + { + "error": false, + "message": "", + "inputs": { + "fromptr": [], + "lenparents": 0, + "outlength": 0, + "parents": [] + }, + "outputs": { + "toptr": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0], + "lenparents": 1, + "outlength": 1, + "parents": [0] + }, + "outputs": { + "toptr": [0] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 5, 20, 1, 6, 21, 2, 7, 22, 3, 8, 23, 4, 9, 24], + "lenparents": 15, + "outlength": 10, + "parents": [0, 5, 5, 1, 6, 6, 2, 7, 7, 3, 8, 8, 4, 9, 9] + }, + "outputs": { + "toptr": [0, 1, 1, 1, 1, 2, 2, 2, 2, 2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 3, 5, 7, 11, 13, 17, 19, 23], + "lenparents": 9, + "outlength": 6, + "parents": 
[0, 0, 0, 2, 2, 3, 4, 4, 5] + }, + "outputs": { + "toptr": [3, 0, 2, 1, 2, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 0, 0, 1, 0, 0], + "lenparents": 6, + "outlength": 4, + "parents": [0, 0, 0, 2, 2, 3] + }, + "outputs": { + "toptr": [1, 0, 1, 0] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24], + "lenparents": 15, + "outlength": 3, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2] + }, + "outputs": { + "toptr": [4, 5, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], + "lenparents": 30, + "outlength": 6, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5] + }, + "outputs": { + "toptr": [4, 5, 5, 5, 5, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 1, 3, 4, 5, 6], + "lenparents": 6, + "outlength": 4, + "parents": [0, 0, 1, 3, 3, 3] + }, + "outputs": { + "toptr": [1, 1, 0, 3] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 5, 10, 15, 25, 1, 11, 16, 26, 2, 12, 17, 27, 8, 18, 28, 4, 9, 14, 29], + "lenparents": 20, + "outlength": 10, + "parents": [0, 0, 0, 5, 5, 1, 1, 6, 6, 2, 2, 7, 7, 3, 8, 8, 4, 4, 4, 9] + }, + "outputs": { + "toptr": [2, 2, 2, 1, 3, 2, 2, 2, 2, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [15, 20, 25, 16, 21, 26, 17, 22, 27, 18, 23, 28, 19, 24, 29], + "lenparents": 15, + "outlength": 15, + "parents": [0, 5, 10, 1, 6, 11, 2, 7, 12, 3, 8, 13, 4, 9, 14] + }, + "outputs": { + "toptr": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 15, 5, 10, 25, 1, 16, 11, 26, 2, 17, 12, 27, 18, 8, 28, 4, 9, 14, 29], + "lenparents": 20, + "outlength": 15, + "parents": [0, 0, 5, 10, 10, 
1, 1, 11, 11, 2, 2, 12, 12, 3, 8, 13, 4, 9, 14, 14] + }, + "outputs": { + "toptr": [1, 2, 2, 1, 1, 1, 0, 0, 1, 1, 2, 2, 2, 1, 2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 15, 5, 20, 10, 25, 1, 16, 6, 21, 11, 26, 2, 17, 7, 22, 12, 27, 3, 18, 8, 23, 13, 28, 4, 19, 9, 24, 14, 29], + "lenparents": 30, + "outlength": 15, + "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 3, 8, 8, 13, 13, 4, 4, 9, 9, 14, 14] + }, + "outputs": { + "toptr": [1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 5, 10, 15, 20, 25, 1, 6, 11, 16, 21, 26, 2, 7, 12, 17, 22, 27, 3, 8, 13, 18, 23, 28, 4, 9, 14, 19, 24, 29], + "lenparents": 30, + "outlength": 10, + "parents": [0, 0, 0, 5, 5, 5, 1, 1, 1, 6, 6, 6, 2, 2, 2, 7, 7, 7, 3, 3, 3, 8, 8, 8, 4, 4, 4, 9, 9, 9] + }, + "outputs": { + "toptr": [2, 3, 3, 3, 3, 3, 3, 3, 3, 3] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0], + "lenparents": 12, + "outlength": 3, + "parents": [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2] + }, + "outputs": { + "toptr": [4, 4, 0] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 2, 3, 4, 5, 1, 2, 3, 4, 5], + "lenparents": 10, + "outlength": 2, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + }, + "outputs": { + "toptr": [5, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 2, 3, 4, 5, 6], + "lenparents": 6, + "outlength": 1, + "parents": [0, 0, 0, 0, 0, 0] + }, + "outputs": { + "toptr": [6] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 7, 13, 17, 23, 3, 11, 19, 5], + "lenparents": 9, + "outlength": 8, + "parents": [0, 0, 0, 6, 6, 1, 1, 7, 2] + }, + "outputs": { + "toptr": [3, 2, 1, 0, 0, 0, 2, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 16, 0, 2, 32, 0, 4, 64, 0, 8, 128, 0], + "lenparents": 12, + 
"outlength": 4, + "parents": [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3] + }, + "outputs": { + "toptr": [2, 2, 2, 2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 1, 2, 3, 4, 5], + "lenparents": 6, + "outlength": 4, + "parents": [0, 0, 0, 2, 2, 3] + }, + "outputs": { + "toptr": [2, 0, 2, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 4, 1, 3, 5, 6], + "lenparents": 6, + "outlength": 4, + "parents": [0, 0, 1, 1, 1, 3] + }, + "outputs": { + "toptr": [1, 3, 0, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 4, 9, 16, 25, 1, 4, 9, 16, 25], + "lenparents": 10, + "outlength": 2, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + }, + "outputs": { + "toptr": [5, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 4, 9, 16, 26, 1, 4, 10, 16, 24], + "lenparents": 10, + "outlength": 2, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + }, + "outputs": { + "toptr": [5, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 5, 20, 1, 6, 21, 2, 7, 22, 3, 8, 23, 4, 9, 24], + "lenparents": 15, + "outlength": 10, + "parents": [0, 0, 5, 1, 1, 6, 2, 2, 7, 3, 3, 8, 4, 4, 9] + }, + "outputs": { + "toptr": [1, 2, 2, 2, 2, 1, 1, 1, 1, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [15, 20, 25, 16, 21, 26, 17, 22, 27, 18, 23, 28, 19, 24, 29], + "lenparents": 15, + "outlength": 5, + "parents": [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4] + }, + "outputs": { + "toptr": [3, 3, 3, 3, 3] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 2, 3], + "lenparents": 3, + "outlength": 1, + "parents": [0, 0, 0] + }, + "outputs": { + "toptr": [3] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 1, 2, 4, 5, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 25, 26, 27, 28, 29], + "lenparents": 20, + "outlength": 6, + "parents": [0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 5, 5, 5, 5, 5] 
+ }, + "outputs": { + "toptr": [3, 3, 4, 4, 0, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 2, 4, 5, 5], + "lenparents": 5, + "outlength": 3, + "parents": [0, 0, 0, 2, 2] + }, + "outputs": { + "toptr": [3, 0, 2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], + "lenparents": 15, + "outlength": 3, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2] + }, + "outputs": { + "toptr": [5, 5, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [4, 1, 0, 1, 4, 5, 1, 0, 1, 3], + "lenparents": 10, + "outlength": 2, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + }, + "outputs": { + "toptr": [4, 4] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [4, 1, 0, 1, 4, 4, 1, 0, 1, 4], + "lenparents": 10, + "outlength": 2, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + }, + "outputs": { + "toptr": [4, 4] + } + } + ] + }, + { + "name": "awkward_reduce_prod", + "status": true, + "tests": [ + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 0, 0, 1, 0, 0], + "lenparents": 6, + "outlength": 4, + "parents": [0, 0, 0, 2, 2, 3] + }, + "outputs": { + "toptr": [0, 1, 0, 0] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [], + "lenparents": 0, + "outlength": 0, + "parents": [] + }, + "outputs": { + "toptr": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 1, 2, 3, 4, 5], + "lenparents": 6, + "outlength": 4, + "parents": [0, 0, 0, 2, 2, 3] + }, + "outputs": { + "toptr": [0, 1, 12, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 53, 31, 101, 3, 59, 37, 103, 5, 61, 41, 107, 7, 67, 43, 109, 11, 71, 47, 113], + "lenparents": 20, + "outlength": 15, + "parents": [0, 0, 10, 10, 1, 1, 11, 11, 2, 2, 12, 12, 3, 3, 13, 13, 4, 4, 14, 14] + }, + "outputs": { + "toptr": [106, 177, 305, 469, 781, 1, 1, 1, 1, 1, 3131, 
3811, 4387, 4687, 5311] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 53, 13, 73, 31, 101, 3, 59, 17, 79, 37, 103, 5, 61, 19, 83, 41, 107, 7, 67, 23, 89, 43, 109, 11, 71, 47, 113], + "lenparents": 28, + "outlength": 15, + "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 3, 8, 8, 13, 13, 4, 4, 14, 14] + }, + "outputs": { + "toptr": [106, 177, 305, 469, 781, 949, 1343, 1577, 2047, 1, 3131, 3811, 4387, 4687, 5311] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 53, 13, 73, 31, 101, 3, 59, 17, 79, 37, 103, 5, 61, 19, 83, 41, 107, 7, 67, 23, 89, 43, 11, 71, 29, 97, 47], + "lenparents": 28, + "outlength": 15, + "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 3, 8, 8, 13, 4, 4, 9, 9, 14] + }, + "outputs": { + "toptr": [106, 177, 305, 469, 781, 949, 1343, 1577, 2047, 2813, 3131, 3811, 4387, 43, 47] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 53, 13, 73, 31, 101, 3, 59, 17, 79, 37, 103, 5, 61, 19, 83, 41, 107, 7, 67, 23, 89, 43, 109, 11, 71, 29, 97], + "lenparents": 28, + "outlength": 14, + "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 3, 8, 8, 13, 13, 4, 4, 9, 9] + }, + "outputs": { + "toptr": [106, 177, 305, 469, 781, 949, 1343, 1577, 2047, 2813, 3131, 3811, 4387, 4687] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 53, 13, 73, 31, 101, 3, 59, 17, 79, 37, 103, 5, 61, 19, 83, 41, 107, 7, 67, 23, 89, 43, 109, 11, 71, 29, 97, 47], + "lenparents": 29, + "outlength": 15, + "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 3, 8, 8, 13, 13, 4, 4, 9, 9, 14] + }, + "outputs": { + "toptr": [106, 177, 305, 469, 781, 949, 1343, 1577, 2047, 2813, 3131, 3811, 4387, 4687, 47] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 53, 13, 73, 31, 101, 3, 59, 17, 79, 37, 103, 5, 61, 19, 83, 41, 107, 7, 67, 23, 89, 43, 109, 11, 71, 
29, 97, 47, 113], + "lenparents": 30, + "outlength": 15, + "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 3, 8, 8, 13, 13, 4, 4, 9, 9, 14, 14] + }, + "outputs": { + "toptr": [106, 177, 305, 469, 781, 949, 1343, 1577, 2047, 2813, 3131, 3811, 4387, 4687, 5311] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 53, 13, 73, 31, 101, 3, 59, 17, 79, 37, 103, 5, 61, 19, 83, 41, 107, 7, 67, 23, 89, 43, 109, 11, 71, 29, 47], + "lenparents": 28, + "outlength": 15, + "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 3, 8, 8, 13, 13, 4, 4, 9, 14] + }, + "outputs": { + "toptr": [106, 177, 305, 469, 781, 949, 1343, 1577, 2047, 29, 3131, 3811, 4387, 4687, 47] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0], + "lenparents": 1, + "outlength": 3, + "parents": [2] + }, + "outputs": { + "toptr": [1, 1, 0] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [101, 103, 107, 109, 113, 53, 59, 61, 67, 71, 31, 37, 41, 43, 47, 2, 3, 5, 7, 11], + "lenparents": 20, + "outlength": 6, + "parents": [0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5] + }, + "outputs": { + "toptr": [13710311357, 1, 907383479, 95041567, 1, 2310] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [101, 103, 107, 109, 113, 73, 79, 83, 89, 97, 53, 59, 61, 67, 71, 31, 37, 41, 43, 47, 13, 17, 19, 23, 29, 2, 3, 5, 7, 11], + "lenparents": 30, + "outlength": 6, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5] + }, + "outputs": { + "toptr": [13710311357, 4132280413, 907383479, 95041567, 2800733, 2310] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [101, 103, 107, 109, 113, 53, 59, 61, 67, 71, 31, 37, 41, 43, 47, 2, 3, 5, 7, 11], + "lenparents": 20, + "outlength": 4, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3] + }, + "outputs": { + "toptr": 
[13710311357, 907383479, 95041567, 2310] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 7, 17, 29, 3, 11, 19, 31, 5, 13, 23, 37], + "lenparents": 12, + "outlength": 6, + "parents": [0, 0, 3, 3, 1, 1, 4, 4, 2, 2, 5, 5] + }, + "outputs": { + "toptr": [14, 33, 65, 493, 589, 851] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [3, 53, 13, 73, 31, 101, 5, 59, 17, 79, 37, 103, 7, 61, 19, 83, 41, 107, 67, 23, 89, 43, 109, 71, 29, 97, 47, 113], + "lenparents": 28, + "outlength": 15, + "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 8, 8, 13, 13, 4, 9, 9, 14, 14] + }, + "outputs": { + "toptr": [159, 295, 427, 67, 71, 949, 1343, 1577, 2047, 2813, 3131, 3811, 4387, 4687, 5311] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [3, 53, 13, 73, 31, 101, 5, 59, 17, 79, 37, 103, 7, 61, 19, 83, 41, 107, 11, 67, 23, 89, 43, 109, 71, 29, 97, 47, 113], + "lenparents": 29, + "outlength": 15, + "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 3, 8, 8, 13, 13, 4, 9, 9, 14, 14] + }, + "outputs": { + "toptr": [159, 295, 427, 737, 71, 949, 1343, 1577, 2047, 2813, 3131, 3811, 4387, 4687, 5311] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [3, 53, 13, 73, 31, 101, 5, 59, 17, 79, 37, 103, 7, 61, 19, 83, 41, 107, 11, 67, 23, 89, 43, 109, 71, 97, 47, 113], + "lenparents": 28, + "outlength": 15, + "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 3, 8, 8, 13, 13, 4, 9, 14, 14] + }, + "outputs": { + "toptr": [159, 295, 427, 737, 71, 949, 1343, 1577, 2047, 97, 3131, 3811, 4387, 4687, 5311] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 7, 13, 17, 23, 3, 11, 19, 5], + "lenparents": 9, + "outlength": 8, + "parents": [0, 0, 0, 6, 6, 1, 1, 7, 2] + }, + "outputs": { + "toptr": [182, 33, 5, 1, 1, 1, 391, 19] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 
3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37], + "lenparents": 12, + "outlength": 3, + "parents": [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2] + }, + "outputs": { + "toptr": [210, 46189, 765049] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 3, 5, 7, 11, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 101, 103, 107, 109, 113], + "lenparents": 20, + "outlength": 6, + "parents": [0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5] + }, + "outputs": { + "toptr": [2310, 1, 95041567, 907383479, 1, 13710311357] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113], + "lenparents": 30, + "outlength": 6, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5] + }, + "outputs": { + "toptr": [2310, 2800733, 95041567, 907383479, 4132280413, 13710311357] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 3, 5, 7, 11, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 101, 103, 107, 109, 113], + "lenparents": 20, + "outlength": 4, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3] + }, + "outputs": { + "toptr": [2310, 95041567, 907383479, 13710311357] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 7, 3, 11, 5], + "lenparents": 5, + "outlength": 8, + "parents": [0, 6, 1, 7, 2] + }, + "outputs": { + "toptr": [2, 3, 5, 1, 1, 1, 7, 11] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [5, 53, 13, 73, 31, 101, 7, 59, 17, 79, 37, 103, 11, 61, 19, 83, 41, 107, 67, 23, 89, 43, 109, 71, 29, 97, 47, 113], + "lenparents": 28, + "outlength": 15, + "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 8, 8, 13, 13, 4, 9, 9, 14, 14] + }, + "outputs": { + "toptr": [265, 413, 671, 67, 71, 949, 1343, 1577, 2047, 2813, 3131, 3811, 4387, 4687, 5311] + } + }, + { + "error": 
false, + "message": "", + "inputs": { + "fromptr": [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37], + "lenparents": 12, + "outlength": 8, + "parents": [0, 0, 0, 3, 3, 3, 4, 4, 4, 7, 7, 7] + }, + "outputs": { + "toptr": [30, 1, 1, 1001, 7429, 1, 1, 33263] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 3, 5, 7, 11, 13], + "lenparents": 6, + "outlength": 4, + "parents": [0, 0, 0, 2, 2, 3] + }, + "outputs": { + "toptr": [30, 1, 77, 13] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 3, 5, 7, 11, 13, 17, 19, 23], + "lenparents": 9, "outlength": 6, "parents": [0, 0, 0, 2, 2, 3, 4, 4, 5] }, @@ -25537,7 +26329,7 @@ }, { "name": "awkward_reduce_min", - "status": false, + "status": true, "tests": [ { "error": false, @@ -25557,7 +26349,7 @@ "error": false, "message": "", "inputs": { - "fromptr": [0, 4, 1, 3, 5, 6], + "fromptr": [0, 4, 1, 1, 5, 6], "identity": 9223372036854775807, "lenparents": 6, "outlength": 4, @@ -25739,7 +26531,7 @@ }, { "name": "awkward_reduce_argmin", - "status": false, + "status": true, "tests": [ { "error": false, diff --git a/src/awkward/_connect/cuda/__init__.py b/src/awkward/_connect/cuda/__init__.py index 494fca77d6..c8b2db7ff3 100644 --- a/src/awkward/_connect/cuda/__init__.py +++ b/src/awkward/_connect/cuda/__init__.py @@ -108,6 +108,7 @@ def fetch_template_specializations(kernel_dict): "awkward_ListArray_rpad_axis1", "awkward_ListOffsetArray_drop_none_indexes", "awkward_ListOffsetArray_reduce_nonlocal_maxcount_offsetscopy_64", + "awkward_ListOffsetArray_reduce_local_outoffsets_64", "awkward_UnionArray_regular_index", "awkward_ListOffsetArray_reduce_nonlocal_nextstarts_64", "awkward_ListOffsetArray_rpad_axis1", @@ -119,6 +120,7 @@ def fetch_template_specializations(kernel_dict): "awkward_reduce_sum_int32_bool_64", "awkward_reduce_sum_int64_bool_64", "awkward_reduce_sum_bool", + "awkward_reduce_prod", "awkward_reduce_prod_bool", "awkward_reduce_argmax", "awkward_reduce_argmin", diff --git 
a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu new file mode 100644 index 0000000000..42e8119d46 --- /dev/null +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu @@ -0,0 +1,100 @@ +// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +// BEGIN PYTHON +// def f(grid, block, args): +// (outoffsets, parents, lenparents, outlength, invocation_index, err_code) = args +// if block[0] > 0: +// segment = math.floor((outlength + block[0] - 1) / block[0]) +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) +// else: +// grid_size = 1 +// temp = cupy.zeros(lenparents, dtype=cupy.int64) +// scan_in_array = cupy.zeros(outlength, dtype=cupy.uint64) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, scan_in_array, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, scan_in_array, temp, invocation_index, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_c", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, scan_in_array, temp, invocation_index, err_code)) +// out["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", {dtype_specializations}] = None +// out["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", {dtype_specializations}] = None +// 
out["awkward_ListOffsetArray_reduce_local_outoffsets_64_c", {dtype_specializations}] = None +// END PYTHON + +template +__global__ void +awkward_ListOffsetArray_reduce_local_outoffsets_64_a( + T* outoffsets, + const C* parents, + int64_t lenparents, + int64_t outlength, + uint64_t* scan_in_array, + int64_t* temp, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + outoffsets[thread_id] = 0; + } + } +} + +template +__global__ void +awkward_ListOffsetArray_reduce_local_outoffsets_64_b( + T* outoffsets, + const C* parents, + int64_t lenparents, + int64_t outlength, + uint64_t* scan_in_array, + int64_t* temp, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + temp[thread_id] = 1; + } + __syncthreads(); + + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + int64_t val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[thread_id - stride]; + } + __syncthreads(); + temp[thread_id] += val; + __syncthreads(); + } + + if (thread_id < lenparents) { + int64_t parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + atomicAdd(&scan_in_array[parent], temp[thread_id]); + } + } + } +} + +template +__global__ void +awkward_ListOffsetArray_reduce_local_outoffsets_64_c( + T* outoffsets, + const C* parents, + int64_t lenparents, + int64_t outlength, + uint64_t* scan_in_array, + int64_t* temp, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + outoffsets[0] = 0; + + if (thread_id < outlength) { + outoffsets[thread_id + 1] = 
(T)(scan_in_array[thread_id]); + } + } +} diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu index 555420c3f8..df515f05a4 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu @@ -3,10 +3,18 @@ // BEGIN PYTHON // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code)) +// if block[0] > 0: +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) +// else: +// grid_size = 1 +// atomic_toptr = cupy.array(toptr, dtype=cupy.uint64) +// temp = cupy.zeros(lenparents, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, 
atomic_toptr, temp, invocation_index, err_code)) // out["awkward_reduce_argmax_a", {dtype_specializations}] = None // out["awkward_reduce_argmax_b", {dtype_specializations}] = None +// out["awkward_reduce_argmax_c", {dtype_specializations}] = None // END PYTHON template @@ -17,12 +25,15 @@ awkward_reduce_argmax_a( const U* parents, int64_t lenparents, int64_t outlength, + uint64_t* atomic_toptr, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_id < outlength) { - toptr[thread_id] = -1; + atomic_toptr[thread_id] = -1; } } } @@ -35,17 +46,57 @@ awkward_reduce_argmax_b( const U* parents, int64_t lenparents, int64_t outlength, + uint64_t* atomic_toptr, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + temp[thread_id] = thread_id; + } + __syncthreads(); + + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + int64_t index = -1; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + index = temp[thread_id - stride]; + } + if (index != -1 && (temp[thread_id] == -1 || fromptr[index] > fromptr[temp[thread_id]] || + (fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) { + temp[thread_id] = index; + } + __syncthreads(); + } if (thread_id < lenparents) { int64_t parent = parents[thread_id]; - if (toptr[parent] == -1 || - (fromptr[thread_id] > (fromptr[toptr[parent]]))) { - toptr[parent] = thread_id; // we need the last parent filled, thread random order problem, find max arg at that index + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + atomicExch(&atomic_toptr[parent], temp[thread_id]); } } } } + +template 
+__global__ void +awkward_reduce_argmax_c( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + uint64_t* atomic_toptr, + T* temp, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + toptr[thread_id] = (T)(atomic_toptr[thread_id]); + } + } +} diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu index 282ebd11cc..af1d3fd93d 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu @@ -3,10 +3,18 @@ // BEGIN PYTHON // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code)) +// if block[0] > 0: +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) +// else: +// grid_size = 1 +// atomic_toptr = cupy.array(toptr, dtype=cupy.uint64) +// temp = cupy.zeros(lenparents, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, 
parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) // out["awkward_reduce_argmin_a", {dtype_specializations}] = None // out["awkward_reduce_argmin_b", {dtype_specializations}] = None +// out["awkward_reduce_argmin_c", {dtype_specializations}] = None // END PYTHON template @@ -17,12 +25,15 @@ awkward_reduce_argmin_a( const U* parents, int64_t lenparents, int64_t outlength, + uint64_t* atomic_toptr, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_id < outlength) { - toptr[thread_id] = -1; + atomic_toptr[thread_id] = -1; } } } @@ -35,17 +46,57 @@ awkward_reduce_argmin_b( const U* parents, int64_t lenparents, int64_t outlength, + uint64_t* atomic_toptr, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + temp[thread_id] = thread_id; + } + __syncthreads(); + + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + int64_t index = -1; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + index = temp[thread_id - stride]; + } + if (index != -1 && (temp[thread_id] == -1 || fromptr[index] < fromptr[temp[thread_id]] || + (fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) { + temp[thread_id] = index; + } + __syncthreads(); + } if (thread_id < lenparents) { int64_t parent = parents[thread_id]; - if 
(toptr[parent] == -1 || - (fromptr[thread_id] < (fromptr[toptr[parent]]))) { - toptr[parent] = thread_id; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + atomicExch(&atomic_toptr[parent], temp[thread_id]); } } } } + +template +__global__ void +awkward_reduce_argmin_c( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + uint64_t* atomic_toptr, + T* temp, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + toptr[thread_id] = (T)(atomic_toptr[thread_id]); + } + } +} diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu index 311f04012b..9c55e69600 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu @@ -2,68 +2,71 @@ // BEGIN PYTHON // def f(grid, block, args): -// (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args -// atomicAdd_toptr = cupy.array(toptr, dtype=cupy.uint64) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_a", toptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_b", toptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_c", toptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) +// (toptr, parents, lenparents, outlength, invocation_index, err_code) 
= args +// if block[0] > 0: +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) +// else: +// grid_size = 1 +// temp = cupy.zeros(lenparents, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_a", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_b", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, temp, invocation_index, err_code)) // out["awkward_reduce_count_64_a", {dtype_specializations}] = None // out["awkward_reduce_count_64_b", {dtype_specializations}] = None -// out["awkward_reduce_count_64_c", {dtype_specializations}] = None // END PYTHON -template +template __global__ void awkward_reduce_count_64_a( T* toptr, - const bool* fromptr, - const U* parents, + const C* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_id < outlength) { - atomicAdd_toptr[thread_id] = 0; + toptr[thread_id] = 0; } } } -template +template __global__ void awkward_reduce_count_64_b( T* toptr, - const bool* fromptr, - const U* parents, + const C* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; + if (thread_id < lenparents) { - atomicAdd(atomicAdd_toptr + parents[thread_id], (uint64_t)1); + temp[thread_id] = 1; } - } -} + __syncthreads(); -template -__global__ void -awkward_reduce_count_64_c(T* toptr, - const bool* fromptr, - const U* 
parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { - if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_id < outlength) { - toptr[thread_id] = (T)atomicAdd_toptr[thread_id]; + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + int64_t val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[thread_id - stride]; + } + __syncthreads(); + temp[thread_id] += val; + __syncthreads(); + } + + if (thread_id < lenparents) { + int64_t parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + atomicAdd(&toptr[parent], temp[thread_id]); + } } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu index 6b07dfa208..ffcb0b8bd3 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu @@ -3,13 +3,15 @@ // BEGIN PYTHON // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args -// atomicAdd_toptr = cupy.array(toptr, dtype=cupy.uint64) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_c", toptr.dtype, 
fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) +// if block[0] > 0: +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) +// else: +// grid_size = 1 +// temp = cupy.zeros(lenparents, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, temp, invocation_index, err_code)) // out["awkward_reduce_countnonzero_a", {dtype_specializations}] = None // out["awkward_reduce_countnonzero_b", {dtype_specializations}] = None -// out["awkward_reduce_countnonzero_c", {dtype_specializations}] = None // END PYTHON template @@ -20,14 +22,14 @@ awkward_reduce_countnonzero_a( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - atomicAdd_toptr[thread_id] = 0; + toptr[thread_id] = 0; } } } @@ -40,34 +42,33 @@ awkward_reduce_countnonzero_b( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - atomicAdd(atomicAdd_toptr + parents[thread_id], - (uint64_t)(fromptr[thread_id] != 0)); + temp[thread_id] = 
(fromptr[thread_id] != 0) ? 1 : 0; } - } -} + __syncthreads(); -template -__global__ void -awkward_reduce_countnonzero_c(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { - if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + int64_t val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[thread_id - stride]; + } + __syncthreads(); + temp[thread_id] += val; + __syncthreads(); + } - if (thread_id < outlength) { - toptr[thread_id] = (T)atomicAdd_toptr[thread_id]; + if (thread_id < lenparents) { + int64_t parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + atomicAdd(&toptr[parent], temp[thread_id]); + } } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu index 3c20b653ac..6a3fe66055 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu @@ -3,8 +3,13 @@ // BEGIN PYTHON // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, identity, invocation_index, err_code) = args -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, identity, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, identity, invocation_index, err_code)) +// if block[0] > 0: +// grid_size = math.floor((lenparents + block[0] 
- 1) / block[0]) +// else: +// grid_size = 1 +// temp = cupy.full(lenparents, identity, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, toptr.dtype.type(identity), temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, toptr.dtype.type(identity), temp, invocation_index, err_code)) // out["awkward_reduce_max_a", {dtype_specializations}] = None // out["awkward_reduce_max_b", {dtype_specializations}] = None // END PYTHON @@ -18,6 +23,7 @@ awkward_reduce_max_a( int64_t lenparents, int64_t outlength, T identity, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { @@ -37,15 +43,34 @@ awkward_reduce_max_b( int64_t lenparents, int64_t outlength, T identity, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + temp[thread_id] = fromptr[thread_id]; + } + __syncthreads(); + + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + T val = identity; + + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[thread_id - stride]; + } + __syncthreads(); + temp[thread_id] = val > temp[thread_id] ? val : temp[thread_id]; + __syncthreads(); + } if (thread_id < lenparents) { - C x = fromptr[thread_id]; - toptr[parents[thread_id]] = - (x > toptr[parents[thread_id]] ? 
x : toptr[parents[thread_id]]); + int64_t parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + atomicMax(&toptr[parent], temp[thread_id]); + } } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu index ae0e2dcb61..12a72b338f 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu @@ -3,8 +3,13 @@ // BEGIN PYTHON // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, identity, invocation_index, err_code) = args -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, identity, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, identity, invocation_index, err_code)) +// if block[0] > 0: +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) +// else: +// grid_size = 1 +// temp = cupy.full(lenparents, identity, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, toptr.dtype.type(identity), temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, toptr.dtype.type(identity), temp, invocation_index, err_code)) // out["awkward_reduce_min_a", {dtype_specializations}] = None 
// out["awkward_reduce_min_b", {dtype_specializations}] = None // END PYTHON @@ -18,10 +23,12 @@ awkward_reduce_min_a( int64_t lenparents, int64_t outlength, T identity, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_id < outlength) { toptr[thread_id] = identity; } @@ -37,15 +44,33 @@ awkward_reduce_min_b( int64_t lenparents, int64_t outlength, T identity, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + temp[thread_id] = fromptr[thread_id]; + } + __syncthreads(); + + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + T val = identity; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[thread_id - stride]; + } + __syncthreads(); + temp[thread_id] = val < temp[thread_id] ? val : temp[thread_id]; + __syncthreads(); + } if (thread_id < lenparents) { - C x = fromptr[thread_id]; - toptr[parents[thread_id]] = - (x < toptr[parents[thread_id]] ? 
x : toptr[parents[thread_id]]); + int64_t parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + atomicMin(&toptr[parent], temp[thread_id]); + } } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu new file mode 100644 index 0000000000..9248e20efc --- /dev/null +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu @@ -0,0 +1,100 @@ +// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +// BEGIN PYTHON +// def f(grid, block, args): +// (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args +// if block[0] > 0: +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) +// else: +// grid_size = 1 +// atomic_toptr = cupy.array(toptr, dtype=toptr.dtype) +// temp = cupy.ones(lenparents, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// out["awkward_reduce_prod_a", {dtype_specializations}] = None +// out["awkward_reduce_prod_b", {dtype_specializations}] = None +// out["awkward_reduce_prod_c", 
{dtype_specializations}] = None +// END PYTHON + +template +__global__ void +awkward_reduce_prod_a( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + T* atomic_toptr, + T* temp, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + atomic_toptr[thread_id] = 1; + } + } +} + +template +__global__ void +awkward_reduce_prod_b( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + T* atomic_toptr, + T* temp, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + temp[thread_id] = fromptr[thread_id]; + } + __syncthreads(); + + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + T val = 1; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[thread_id - stride]; + } + __syncthreads(); + temp[thread_id] *= val; + __syncthreads(); + } + + if (thread_id < lenparents) { + int64_t parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + atomicMul(&atomic_toptr[parent], temp[thread_id]); + } + } + } +} + +template +__global__ void +awkward_reduce_prod_c( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + T* atomic_toptr, + T* temp, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + toptr[thread_id] = (T)(atomic_toptr[thread_id]); + } + } +} diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu 
index 74843af6c0..9d85b366c7 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu @@ -3,10 +3,15 @@ // BEGIN PYTHON // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args -// atomicAdd_toptr = cupy.array(toptr, dtype=cupy.uint64) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_c", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) +// if block[0] > 0: +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) +// else: +// grid_size = 1 +// atomic_toptr = cupy.array(toptr, dtype=cupy.uint32) +// temp = cupy.ones(lenparents, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_a", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_b", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_c", bool_, 
cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) // out["awkward_reduce_prod_bool_a", {dtype_specializations}] = None // out["awkward_reduce_prod_bool_b", {dtype_specializations}] = None // out["awkward_reduce_prod_bool_c", {dtype_specializations}] = None @@ -20,14 +25,15 @@ awkward_reduce_prod_bool_a( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + uint32_t* atomic_toptr, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - atomicAdd_toptr[thread_id] = true; + atomic_toptr[thread_id] = 1; } } } @@ -40,34 +46,55 @@ awkward_reduce_prod_bool_b( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + uint32_t* atomic_toptr, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - atomicAnd(atomicAdd_toptr + parents[thread_id], - (uint64_t)(fromptr[thread_id] != 0)); + temp[thread_id] = fromptr[thread_id]; + } + __syncthreads(); + + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + T val = 1; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[thread_id - stride]; + } + __syncthreads(); + temp[thread_id] &= (val != 0); + __syncthreads(); + } + + if (thread_id < lenparents) { + int64_t parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + atomicAnd(&atomic_toptr[parent], temp[thread_id]); + } } } } template __global__ void -awkward_reduce_prod_bool_c(T* toptr, - const C* fromptr, - const U* 
parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_prod_bool_c( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + uint32_t* atomic_toptr, + T* temp, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = (T)atomicAdd_toptr[thread_id]; + toptr[thread_id] = (T)(atomic_toptr[thread_id]); } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu index 13c5a31dbf..8ce2b8159c 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu @@ -3,13 +3,15 @@ // BEGIN PYTHON // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args -// atomicAdd_toptr = cupy.array(toptr, dtype=cupy.uint64) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_c", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) +// if block[0] > 0: +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) +// else: +// grid_size = 1 +// temp = cupy.zeros(lenparents, dtype=toptr.dtype) +// 
cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, temp, invocation_index, err_code)) // out["awkward_reduce_sum_a", {dtype_specializations}] = None // out["awkward_reduce_sum_b", {dtype_specializations}] = None -// out["awkward_reduce_sum_c", {dtype_specializations}] = None // END PYTHON template @@ -20,14 +22,14 @@ awkward_reduce_sum_a( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - atomicAdd_toptr[thread_id] = 0; + toptr[thread_id] = 0; } } } @@ -40,35 +42,33 @@ awkward_reduce_sum_b( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - atomicAdd(atomicAdd_toptr + parents[thread_id], - (uint64_t)fromptr[thread_id]); + temp[thread_id] = fromptr[thread_id]; } - } -} + __syncthreads(); -template -__global__ void -awkward_reduce_sum_c( - T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { - if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + for (int64_t 
stride = 1; stride < blockDim.x; stride *= 2) { + T val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[thread_id - stride]; + } + __syncthreads(); + temp[thread_id] += val; + __syncthreads(); + } - if (thread_id < outlength) { - toptr[thread_id] = (T)atomicAdd_toptr[thread_id]; + if (thread_id < lenparents) { + int64_t parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + atomicAdd(&toptr[parent], temp[thread_id]); + } } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu index 0e062a6c78..f85df8e20a 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu @@ -3,10 +3,15 @@ // BEGIN PYTHON // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args -// atomicAdd_toptr = cupy.array(toptr, dtype=cupy.uint64) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_c", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) +// if block[0] > 0: +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) +// else: +// grid_size = 1 +// atomic_toptr = 
cupy.array(toptr, dtype=cupy.uint32) +// temp = cupy.zeros(lenparents, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_a", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_b", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_c", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) // out["awkward_reduce_sum_bool_a", {dtype_specializations}] = None // out["awkward_reduce_sum_bool_b", {dtype_specializations}] = None // out["awkward_reduce_sum_bool_c", {dtype_specializations}] = None @@ -20,14 +25,15 @@ awkward_reduce_sum_bool_a( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + uint32_t* atomic_toptr, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - atomicAdd_toptr[thread_id] = false; + atomic_toptr[thread_id] = 0; } } } @@ -40,34 +46,55 @@ awkward_reduce_sum_bool_b( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + uint32_t* atomic_toptr, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - atomicOr(atomicAdd_toptr + parents[thread_id], - 
(uint64_t)(fromptr[thread_id] != 0)); + temp[thread_id] = fromptr[thread_id]; + } + __syncthreads(); + + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + T val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[thread_id - stride]; + } + __syncthreads(); + temp[thread_id] |= (val != 0); + __syncthreads(); + } + + if (thread_id < lenparents) { + int64_t parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + atomicOr(&atomic_toptr[parent], temp[thread_id]); + } } } } template __global__ void -awkward_reduce_sum_bool_c(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_sum_bool_c( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + uint32_t* atomic_toptr, + T* temp, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = (T)atomicAdd_toptr[thread_id]; + toptr[thread_id] = (T)(atomic_toptr[thread_id]); } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu index 8bdb3fccc2..f52b6fb21c 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu @@ -3,13 +3,15 @@ // BEGIN PYTHON // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args -// atomicAdd_toptr = cupy.array(toptr, dtype=cupy.uint64) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_a", toptr.dtype, 
fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_c", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) +// if block[0] > 0: +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) +// else: +// grid_size = 1 +// temp = cupy.zeros(lenparents, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_a", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_b", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, temp, invocation_index, err_code)) // out["awkward_reduce_sum_int32_bool_64_a", {dtype_specializations}] = None // out["awkward_reduce_sum_int32_bool_64_b", {dtype_specializations}] = None -// out["awkward_reduce_sum_int32_bool_64_c", {dtype_specializations}] = None // END PYTHON template @@ -20,14 +22,14 @@ awkward_reduce_sum_int32_bool_64_a( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - atomicAdd_toptr[thread_id] = 0; + toptr[thread_id] = 0; } } } @@ -40,34 +42,33 @@ awkward_reduce_sum_int32_bool_64_b( const U* parents, 
int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - atomicAdd(atomicAdd_toptr + parents[thread_id], - (uint64_t)(fromptr[thread_id] != 0)); + temp[thread_id] = (fromptr[thread_id] != 0) ? 1 : 0; - } -} + __syncthreads(); -template -__global__ void -awkward_reduce_sum_int32_bool_64_c(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { - if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + T val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[thread_id - stride]; + } + __syncthreads(); + temp[thread_id] += val; + __syncthreads(); + } - if (thread_id < outlength) { - toptr[thread_id] = (T)atomicAdd_toptr[thread_id]; + if (thread_id < lenparents) { + int64_t parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + atomicAdd(&toptr[parent], temp[thread_id]); + } } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu index 041558a663..7e220cccc0 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu @@ -3,13 +3,15 @@ // BEGIN PYTHON // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args -// atomicAdd_toptr = cupy.array(toptr, 
dtype=cupy.uint64) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_c", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) +// if block[0] > 0: +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) +// else: +// grid_size = 1 +// temp = cupy.zeros(lenparents, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_a", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_b", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, temp, invocation_index, err_code)) // out["awkward_reduce_sum_int64_bool_64_a", {dtype_specializations}] = None // out["awkward_reduce_sum_int64_bool_64_b", {dtype_specializations}] = None -// out["awkward_reduce_sum_int64_bool_64_c", {dtype_specializations}] = None // END PYTHON template @@ -20,14 +22,14 @@ awkward_reduce_sum_int64_bool_64_a( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - 
atomicAdd_toptr[thread_id] = 0; + toptr[thread_id] = 0; } } } @@ -40,34 +42,33 @@ awkward_reduce_sum_int64_bool_64_b( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - atomicAdd(atomicAdd_toptr + parents[thread_id], - (uint64_t)(fromptr[thread_id] != 0)); + temp[thread_id] = (fromptr[thread_id] != 0) ? 1 : 0;; } - } -} + __syncthreads(); -template -__global__ void -awkward_reduce_sum_int64_bool_64_c(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { - if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + T val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[thread_id - stride]; + } + __syncthreads(); + temp[thread_id] += val; + __syncthreads(); + } - if (thread_id < outlength) { - toptr[thread_id] = (T)atomicAdd_toptr[thread_id]; + if (thread_id < lenparents) { + int64_t parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + atomicAdd(&toptr[parent], temp[thread_id]); + } } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu index e101a7f3d8..9e8eb2bb35 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu @@ -32,6 +32,31 @@ typedef unsigned long long uintmax_t; atomicMin(err_code, \ invocation_index*(1 << ERROR_BITS) + (int)(ERROR_KERNEL_CODE)); 
+// BEGIN PYTHON +// def min_max_type(dtype): +// supported_types = { +// 'bool': cupy.int32, +// 'int8': cupy.int32, +// 'int16': cupy.int32, +// 'int32': cupy.int32, +// 'int64': cupy.int64, +// 'uint8': cupy.uint32, +// 'uint16': cupy.uint32, +// 'uint32': cupy.uint32, +// 'uint64': cupy.uint64, +// 'float32': cupy.float32, +// 'float64': cupy.float64 +// } +// if str(dtype) in supported_types: +// return supported_types[str(dtype)] +// else: +// raise ValueError("Unsupported dtype.", dtype) +// END PYTHON + + +// used by awkward_ListArray_getitem_next_range_carrylength +// and awkward_ListArray_getitem_next_range kernels + const int64_t kMaxInt64 = 9223372036854775806; // 2**63 - 2: see below const int64_t kSliceNone = kMaxInt64 + 1; // for Slice::none() @@ -43,29 +68,281 @@ awkward_regularize_rangeslice( bool hasstart, bool hasstop, int64_t length) { - if (posstep) { - if (!hasstart) *start = 0; - else if (*start < 0) *start += length; - if (*start < 0) *start = 0; - if (*start > length) *start = length; - - if (!hasstop) *stop = length; - else if (*stop < 0) *stop += length; - if (*stop < 0) *stop = 0; - if (*stop > length) *stop = length; - if (*stop < *start) *stop = *start; - } - - else { - if (!hasstart) *start = length - 1; - else if (*start < 0) *start += length; - if (*start < -1) *start = -1; - if (*start > length - 1) *start = length - 1; - - if (!hasstop) *stop = -1; - else if (*stop < 0) *stop += length; - if (*stop < -1) *stop = -1; - if (*stop > length - 1) *stop = length - 1; - if (*stop > *start) *stop = *start; - } + if (posstep) { + if (!hasstart) *start = 0; + else if (*start < 0) *start += length; + if (*start < 0) *start = 0; + if (*start > length) *start = length; + + if (!hasstop) *stop = length; + else if (*stop < 0) *stop += length; + if (*stop < 0) *stop = 0; + if (*stop > length) *stop = length; + if (*stop < *start) *stop = *start; } + + else { + if (!hasstart) *start = length - 1; + else if (*start < 0) *start += length; + if 
(*start < -1) *start = -1; + if (*start > length - 1) *start = length - 1; + + if (!hasstop) *stop = -1; + else if (*stop < 0) *stop += length; + if (*stop < -1) *stop = -1; + if (*stop > length - 1) *stop = length - 1; + if (*stop > *start) *stop = *start; + } +} + + +// atomicMin() specializations +template +__device__ T atomicMin(T* address, T val); + +// atomicMin() specialization for int8_t +template <> +__device__ int8_t atomicMin(int8_t* address, int8_t val) { + unsigned int *base_address = (unsigned int *)((size_t)address & ~3); + unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210}; + unsigned int sel = selectors[(size_t)address & 3]; + unsigned int old, assumed, min_, new_; + old = *base_address; + do { + assumed = old; + min_ = min(val, (int8_t)__byte_perm(old, 0, ((size_t)address & 3))); + new_ = __byte_perm(old, min_, sel); + old = atomicCAS(base_address, assumed, new_); + } while (assumed != old); + return old; +} + +// atomicMin() specialization for uint8_t +template <> +__device__ uint8_t atomicMin(uint8_t* address, uint8_t val) { + unsigned int *base_address = (unsigned int *)((size_t)address & ~3); + unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210}; + unsigned int sel = selectors[(size_t)address & 3]; + unsigned int old, assumed, min_, new_; + old = *base_address; + do { + assumed = old; + min_ = min(val, (uint8_t)__byte_perm(old, 0, ((size_t)address & 3))); + new_ = __byte_perm(old, min_, sel); + old = atomicCAS(base_address, assumed, new_); + } while (assumed != old); + return old; +} + +// atomicMin() specialization for int16_t +template <> +__device__ int16_t atomicMin(int16_t* address, int16_t val) { + uint16_t* address_as_ush = reinterpret_cast(address); + uint16_t old = *address_as_ush, assumed; + do { + assumed = old; + int16_t temp = min(val, reinterpret_cast(assumed)); + old = atomicCAS( + address_as_ush, assumed, reinterpret_cast(temp) + ); + } while (assumed != old); + return reinterpret_cast(old); +} + +// 
atomicMin() specialization for uint16_t +template <> +__device__ uint16_t atomicMin(uint16_t* address, uint16_t val) { + uint16_t old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, min(val, assumed)); + } while (assumed != old); + return old; +} + +// atomicMin() specialization for float +template <> +__device__ float atomicMin(float* addr, float value) { + float old; + old = !signbit(value) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value))) : + __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value))); + return old; +} + +// atomicMin() specialization for double +template <> +__device__ double atomicMin(double* addr, double value) { + double old; + old = !signbit(value) ? __longlong_as_double(atomicMin((long long int*)addr, __double_as_longlong(value))) : + __ull2double_rz(atomicMax((unsigned long long int*)addr, __double2ull_ru(value))); + return old; +} + + +// atomicMax() specializations +template +__device__ T atomicMax(T* address, T val); + +// atomicMax() specialization for int8_t +template <> +__device__ int8_t atomicMax(int8_t* address, int8_t val) { + unsigned int *base_address = (unsigned int *)((size_t)address & ~3); + unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210}; + unsigned int sel = selectors[(size_t)address & 3]; + unsigned int old, assumed, max_, new_; + old = *base_address; + do { + assumed = old; + max_ = max(val, (int8_t)__byte_perm(old, 0, ((size_t)address & 3))); + new_ = __byte_perm(old, max_, sel); + old = atomicCAS(base_address, assumed, new_); + } while (assumed != old); + return old; +} + +// atomicMax() specialization for uint8_t +template <> +__device__ uint8_t atomicMax(uint8_t* address, uint8_t val) { + unsigned int *base_address = (unsigned int *)((size_t)address & ~3); + unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210}; + unsigned int sel = selectors[(size_t)address & 3]; + unsigned int old, assumed, max_, new_; + old = *base_address; + do { + 
assumed = old; + max_ = max(val, (uint8_t)__byte_perm(old, 0, ((size_t)address & 3))); + new_ = __byte_perm(old, max_, sel); + old = atomicCAS(base_address, assumed, new_); + } while (assumed != old); + return old; +} + +// atomicMax() specialization for int16_t +template <> +__device__ int16_t atomicMax(int16_t* address, int16_t val) { + uint16_t* address_as_ush = reinterpret_cast(address); + uint16_t old = *address_as_ush, assumed; + do { + assumed = old; + int16_t temp = max(val, reinterpret_cast(assumed)); + old = atomicCAS( + address_as_ush, assumed, reinterpret_cast(temp) + ); + } while (assumed != old); + return reinterpret_cast(old); +} + +// atomicMax() specialization for uint16_t +template <> +__device__ uint16_t atomicMax(uint16_t* address, uint16_t val) { + uint16_t old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, max(val, assumed)); + } while (assumed != old); + return old; +} + +// atomicMax() specialization for float +template <> +__device__ float atomicMax(float* addr, float value) { + float old; + old = !signbit(value) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) : + __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value))); + return old; +} +// atomicMax() specialization for double +template <> +__device__ double atomicMax(double* addr, double value) { + double old; + old = !signbit(value) ? 
__longlong_as_double(atomicMax((long long int*)addr, __double_as_longlong(value))) : + __ull2double_rz(atomicMin((unsigned long long int*)addr, __double2ull_ru(value))); + return old; +} + + +// atomicAdd() specialization for int64_t +// uses 2's complement +__device__ int64_t atomicAdd(int64_t* address, int64_t val) { + uint64_t* address_as_ull = (uint64_t*)address; + uint64_t old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, assumed + (uint64_t)val); + } while (assumed != old); + return (int64_t)old; +} + + +// atomicMul() specializations +template +__device__ T atomicMul(T* address, T val); + +// atomicMul() specialization for int32_t +template <> +__device__ int32_t atomicMul(int32_t* address, int32_t val) { + int32_t old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, assumed * val); + } while (assumed != old); + return old; +} + +// atomicMul() specialization for uint32_t +template <> +__device__ uint32_t atomicMul(uint32_t* address, uint32_t val) { + uint32_t old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, assumed * val); + } while (assumed != old); + return old; +} + +// atomicMul() specialization for int64_t +template <> +__device__ int64_t atomicMul(int64_t* address, int64_t val) { + uint64_t* address_as_uint64 = reinterpret_cast(address); + uint64_t old = *address_as_uint64, assumed; + uint64_t val_as_uint64 = *reinterpret_cast(&val); + do { + assumed = old; + old = atomicCAS(address_as_uint64, assumed, assumed * val_as_uint64); + } while (assumed != old); + return *reinterpret_cast(&old); +} + +// atomicMul() specialization for uint64_t +template <> +__device__ uint64_t atomicMul(uint64_t* address, uint64_t val) { + uint64_t old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, assumed * val); + } while (assumed != old); + return old; +} + +// atomicMul() specialization for float +template <> +__device__ 
float atomicMul(float* address, float val) { + float old = *address, assumed; + do { + assumed = old; + old = __int_as_float(atomicCAS((int*)address, __float_as_int(assumed), __float_as_int(assumed * val))); + } while (assumed != old); + return old; +} + +// atomicMul() specialization for double +template <> +__device__ double atomicMul(double* address, double val) { + uint64_t* address_as_ull = (uint64_t*)address; + uint64_t old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, __double_as_longlong(__longlong_as_double(assumed) * val)); + } while (assumed != old); + return __longlong_as_double(old); +} diff --git a/src/awkward/contents/regulararray.py b/src/awkward/contents/regulararray.py index 2d144a0b7a..a5a16fcdff 100644 --- a/src/awkward/contents/regulararray.py +++ b/src/awkward/contents/regulararray.py @@ -358,7 +358,6 @@ def _carry(self, carry: Index, allow_lazy: bool) -> Content: nextcarry = ak.index.Index64.empty( where.shape[0] * self._size, self._backend.index_nplike ) - assert nextcarry.nplike is self._backend.index_nplike self._maybe_index_error( self._backend[ @@ -472,6 +471,8 @@ def _getitem_next( nexthead, nexttail = ak._slicing.head_tail(tail) nextcarry = ak.index.Index64.empty(self._length, index_nplike) assert nextcarry.nplike is index_nplike + if ak.backend(head) == "cuda": + head = int(ak.to_backend(head, backend=self._backend)[0]) self._maybe_index_error( self._backend[ "awkward_RegularArray_getitem_next_at", nextcarry.dtype.type diff --git a/tests-cuda/test_2922a_new_cuda_kernels.py b/tests-cuda/test_2922a_new_cuda_kernels.py index feb800ecac..fa71d13e63 100644 --- a/tests-cuda/test_2922a_new_cuda_kernels.py +++ b/tests-cuda/test_2922a_new_cuda_kernels.py @@ -16,6 +16,13 @@ to_list = ak.operations.to_list +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + def 
test_0184_concatenate_operation_records(): one = ak.highlevel.Array([[1, 2, 3], [None, 4], None, [None, 5]]).layout two = ak.highlevel.Array([6, 7, 8]).layout diff --git a/tests-cuda/test_2922b_new_cuda_kernels.py b/tests-cuda/test_2922b_new_cuda_kernels.py index 5666dabf59..f03a5ffe71 100644 --- a/tests-cuda/test_2922b_new_cuda_kernels.py +++ b/tests-cuda/test_2922b_new_cuda_kernels.py @@ -10,6 +10,13 @@ to_list = ak.operations.to_list +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + def test_2651_parameter_union(): layout = ak.contents.IndexedArray( ak.index.Index64([0, 1, 2]), diff --git a/tests-cuda/test_3065a_cuda_kernels.py b/tests-cuda/test_3065a_cuda_kernels.py index de8b634da0..798d690a41 100644 --- a/tests-cuda/test_3065a_cuda_kernels.py +++ b/tests-cuda/test_3065a_cuda_kernels.py @@ -1,5 +1,6 @@ from __future__ import annotations +import cupy as cp import numpy as np import pytest @@ -9,6 +10,13 @@ to_list = ak.operations.to_list +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + def test_0449_merge_many_arrays_in_one_pass_concatenate(): one = ak.highlevel.Array([1, 2, 3]).layout two = ak.highlevel.Array([4.4, 5.5]).layout diff --git a/tests-cuda/test_3065b_cuda_kernels.py b/tests-cuda/test_3065b_cuda_kernels.py index bad768249c..91e77bd37d 100644 --- a/tests-cuda/test_3065b_cuda_kernels.py +++ b/tests-cuda/test_3065b_cuda_kernels.py @@ -11,6 +11,13 @@ to_list = ak.operations.to_list +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + def test_0582_propagate_context_in_broadcast_and_apply_firsts(): array = ak.Array([[[0, 1, 2], []], [[3, 4]], [], [[5], [6, 7, 8, 9]]]) cuda_array = ak.to_backend(array, "cuda") diff --git 
a/tests-cuda/test_3065c_cuda_kernels.py b/tests-cuda/test_3065c_cuda_kernels.py index 74ac927189..fef0b49181 100644 --- a/tests-cuda/test_3065c_cuda_kernels.py +++ b/tests-cuda/test_3065c_cuda_kernels.py @@ -9,6 +9,13 @@ to_list = ak.operations.to_list +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + def test_0546_fill_none_replacement_value_type(): array = ak.operations.values_astype( ak.highlevel.Array([1.1, 2.2, None, 3.3]), np.float32 diff --git a/tests-cuda/test_3086_cuda_concatenate.py b/tests-cuda/test_3086_cuda_concatenate.py index e35206b55a..ccf06d22df 100644 --- a/tests-cuda/test_3086_cuda_concatenate.py +++ b/tests-cuda/test_3086_cuda_concatenate.py @@ -10,6 +10,13 @@ to_list = ak.operations.to_list +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + def test_0184_concatenate_number(): a1 = ak.highlevel.Array([[1, 2, 3], [], [4, 5]]).layout a2 = ak.highlevel.Array([[[1.1], [2.2, 3.3]], [[]], [[4.4], [5.5]]]).layout diff --git a/tests-cuda/test_3130_cuda_listarray_getitem_next.py b/tests-cuda/test_3130_cuda_listarray_getitem_next.py index c26c8f9319..66783ad014 100644 --- a/tests-cuda/test_3130_cuda_listarray_getitem_next.py +++ b/tests-cuda/test_3130_cuda_listarray_getitem_next.py @@ -19,6 +19,13 @@ offsets2 = ak.index.IndexU32(np.array([0, 2, 3, 3, 5], np.uint32)) +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + def tests_0020_support_unsigned_indexes_listarray_ellipsis(): array1 = ak.contents.ListArray(starts1, stops1, content) array2 = ak.contents.ListArray(starts2, stops2, array1) diff --git a/tests-cuda/test_3136_cuda_argmin_and_argmax.py b/tests-cuda/test_3136_cuda_argmin_and_argmax.py new file mode 100644 index 
0000000000..861ced70c5 --- /dev/null +++ b/tests-cuda/test_3136_cuda_argmin_and_argmax.py @@ -0,0 +1,195 @@ +from __future__ import annotations + +import cupy as cp +import pytest + +import awkward as ak + +to_list = ak.operations.to_list + + +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + +def test_0835_argmin_argmax_axis_None(): + array = ak.highlevel.Array( + [ + [ + [2022, 2023, 2025], + [], + [2027, 2011], + [2013], + ], + [], + [[2017, 2019], [2023]], + ], + ) + cuda_array = ak.to_backend(array, "cuda") + + assert ak.operations.argmin(cuda_array) == 4 + assert ak.operations.argmax(cuda_array) == 3 + + +def test_1106_argminmax_axis_None_missing_values(): + array = ak.highlevel.Array([1, 2, 3, None, 4]) + + cuda_array = ak.to_backend(array, "cuda") + + assert ak.operations.argmax(cuda_array) == 4 + + +def test_0070_argmin_and_argmax_jagged(): + v2_array = ak.operations.from_iter( + [[2.2, 1.1, 3.3], [], [4.4, 5.5], [5.5], [-4.4, -5.5, -6.6]], highlevel=False + ) + + cuda_v2_array = ak.to_backend(v2_array, "cuda", highlevel=False) + + assert to_list(ak.argmin(cuda_v2_array, axis=1, highlevel=False)) == [ + 1, + None, + 0, + 0, + 2, + ] + assert ( + ak.argmin(cuda_v2_array.to_typetracer(), axis=1, highlevel=False).form + == ak.argmin(cuda_v2_array, axis=1, highlevel=False).form + ) + + index2 = ak.index.Index64(cp.array([4, 3, 2, 1, 0], dtype=cp.int64)) + cuda_v2_array2 = ak.contents.IndexedArray(index2, cuda_v2_array) + + assert to_list(ak.argmin(cuda_v2_array2, axis=1, highlevel=False)) == [ + 2, + 0, + 0, + None, + 1, + ] + assert ( + ak.argmin(cuda_v2_array2.to_typetracer(), axis=1, highlevel=False).form + == ak.argmin(cuda_v2_array2, axis=1, highlevel=False).form + ) + + index3 = ak.index.Index64(cp.array([4, 3, -1, 4, 0], dtype=cp.int64)) + cuda_v2_array2 = ak.contents.IndexedOptionArray(index3, cuda_v2_array) + + assert 
to_list(ak.argmin(cuda_v2_array2, axis=1, highlevel=False)) == [ + 2, + 0, + None, + 2, + 1, + ] + assert ( + ak.argmin(cuda_v2_array2.to_typetracer(), axis=1, highlevel=False).form + == ak.argmin(cuda_v2_array2, axis=1, highlevel=False).form + ) + assert to_list(ak.argmin(cuda_v2_array2, axis=-1, highlevel=False)) == [ + 2, + 0, + None, + 2, + 1, + ] + assert ( + ak.argmin(cuda_v2_array2.to_typetracer(), axis=-1, highlevel=False).form + == ak.argmin(cuda_v2_array2, axis=-1, highlevel=False).form + ) + + +def test_0070_argmin_and_argmax_missing(): + array = ak.operations.from_iter( + [[[2.2, 1.1, 3.3]], [[]], [None, None, None], [[-4.4, -5.5, -6.6]]], + highlevel=False, + ) + + cuda_array = ak.to_backend(array, "cuda", highlevel=False) + + assert to_list(ak.argmin(cuda_array, axis=2, highlevel=False)) == [ + [1], + [None], + [None, None, None], + [2], + ] + assert ( + ak.argmin(cuda_array.to_typetracer(), axis=2, highlevel=False).form + == ak.argmin(cuda_array, axis=2, highlevel=False).form + ) + + +def test_0115_generic_reducer_operation_ByteMaskedArray(): + content = ak.operations.from_iter( + [ + [[1.1, 0.0, 2.2], [], [3.3, 4.4]], + [], + [[5.5]], + [[6.6, 9.9, 8.8, 7.7]], + [[], [12.2, 11.1, 10.0]], + ], + highlevel=False, + ) + mask = ak.index.Index8(cp.array([0, 0, 1, 1, 0], dtype=cp.int8)) + content = ak.to_backend(content, "cuda", highlevel=False) + + cuda_v2_array = ak.contents.ByteMaskedArray(mask, content, valid_when=False) + + assert to_list(cuda_v2_array) == [ + [[1.1, 0.0, 2.2], [], [3.3, 4.4]], + [], + None, + None, + [[], [12.2, 11.1, 10.0]], + ] + assert to_list(ak.argmin(cuda_v2_array, axis=-1, highlevel=False)) == [ + [1, None, 0], + [], + None, + None, + [None, 2], + ] + assert ( + ak.argmin(cuda_v2_array.to_typetracer(), axis=-1, highlevel=False).form + == ak.argmin(cuda_v2_array, axis=-1, highlevel=False).form + ) + + +@pytest.mark.parametrize( + "func", + [ + ak.argmin, + ak.argmax, + ], +) +def 
test_2754_highlevel_behavior_missing_reducers(func): + behavior_1 = {"foo": "bar"} + behavior_2 = {"baz": "bargh!"} + + array = ak.Array([[1, 2, 3, 4], [5], [10]]) + + cuda_array = ak.to_backend(array, "cuda") + + assert isinstance(func(cuda_array, axis=1, highlevel=True), ak.Array) + assert isinstance(func(cuda_array, axis=1, highlevel=False), ak.contents.Content) + assert ( + func( + ak.Array(cuda_array, behavior=behavior_1), + axis=1, + highlevel=True, + behavior=behavior_2, + ).behavior + == behavior_2 + ) + assert ( + func( + ak.Array(cuda_array, behavior=behavior_1), + axis=1, + highlevel=True, + ).behavior + == behavior_1 + ) diff --git a/tests-cuda/test_3136_cuda_reducers.py b/tests-cuda/test_3136_cuda_reducers.py new file mode 100644 index 0000000000..06ab47117a --- /dev/null +++ b/tests-cuda/test_3136_cuda_reducers.py @@ -0,0 +1,454 @@ +from __future__ import annotations + +import cupy as cp +import cupy.testing as cpt +import numpy as np +import pytest + +import awkward as ak + +to_list = ak.operations.to_list + + +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + +def prod(xs): + out = 1 + for x in xs: + out *= x + return out + + +def test_0115_generic_reducer_operation_sumprod_types(): + array = np.array([[True, False, False], [True, False, False]]) + content2 = ak.contents.NumpyArray(array.reshape(-1)) + offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + + depth1 = ak.to_backend(depth1, "cuda") + + assert sum(to_list(np.sum(array, axis=-1))) == sum( + to_list(ak.sum(depth1, axis=-1, highlevel=False)) + ) + assert prod(to_list(np.prod(array, axis=-1))) == prod( + to_list(ak.prod(depth1, axis=-1, highlevel=False)) + ) + del depth1 + + +def test_0115_generic_reducer_operation_sumprod_types_1(): + array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int8) + content2 = 
ak.contents.NumpyArray(array.reshape(-1)) + offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + + depth1 = ak.to_backend(depth1, "cuda") + + assert ( + np.sum(array, axis=-1).dtype + == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype + ) + assert ( + np.prod(array, axis=-1).dtype + == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype + ) + assert sum(to_list(np.sum(array, axis=-1))) == sum( + to_list(ak.sum(depth1, axis=-1, highlevel=False)) + ) + assert prod(to_list(np.prod(array, axis=-1))) == prod( + to_list(ak.prod(depth1, axis=-1, highlevel=False)) + ) + del depth1 + + +def test_0115_generic_reducer_operation_sumprod_types_2(): + array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint8) + content2 = ak.contents.NumpyArray(array.reshape(-1)) + offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + + depth1 = ak.to_backend(depth1, "cuda") + + assert ( + np.sum(array, axis=-1).dtype + == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype + ) + assert ( + np.prod(array, axis=-1).dtype + == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype + ) + assert sum(to_list(np.sum(array, axis=-1))) == sum( + to_list(ak.sum(depth1, axis=-1, highlevel=False)) + ) + assert prod(to_list(np.prod(array, axis=-1))) == prod( + to_list(ak.prod(depth1, axis=-1, highlevel=False)) + ) + del depth1 + + +def test_0115_generic_reducer_operation_sumprod_types_3(): + array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int16) + content2 = ak.contents.NumpyArray(array.reshape(-1)) + offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + + depth1 = ak.to_backend(depth1, "cuda") + + assert ( + np.sum(array, axis=-1).dtype + == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype + ) + assert ( + np.prod(array, 
axis=-1).dtype + == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype + ) + assert sum(to_list(np.sum(array, axis=-1))) == sum( + to_list(ak.sum(depth1, axis=-1, highlevel=False)) + ) + assert prod(to_list(np.prod(array, axis=-1))) == prod( + to_list(ak.prod(depth1, axis=-1, highlevel=False)) + ) + del depth1 + + +def test_0115_generic_reducer_operation_sumprod_types_4(): + array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint16) + content2 = ak.contents.NumpyArray(array.reshape(-1)) + offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + + depth1 = ak.to_backend(depth1, "cuda") + + assert ( + np.sum(array, axis=-1).dtype + == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype + ) + assert ( + np.prod(array, axis=-1).dtype + == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype + ) + assert sum(to_list(np.sum(array, axis=-1))) == sum( + to_list(ak.sum(depth1, axis=-1, highlevel=False)) + ) + assert prod(to_list(np.prod(array, axis=-1))) == prod( + to_list(ak.prod(depth1, axis=-1, highlevel=False)) + ) + + +def test_0115_generic_reducer_operation_sumprod_types_5(): + array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int32) + content2 = ak.contents.NumpyArray(array.reshape(-1)) + offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + + depth1 = ak.to_backend(depth1, "cuda") + + assert ( + np.sum(array, axis=-1).dtype + == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype + ) + assert ( + np.prod(array, axis=-1).dtype + == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype + ) + assert sum(to_list(np.sum(array, axis=-1))) == sum( + to_list(ak.sum(depth1, axis=-1, highlevel=False)) + ) + assert prod(to_list(np.prod(array, axis=-1))) == prod( + to_list(ak.prod(depth1, axis=-1, highlevel=False)) + ) + del depth1 + + +def 
test_0115_generic_reducer_operation_sumprod_types_6(): + array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint32) + content2 = ak.contents.NumpyArray(array.reshape(-1)) + offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + + depth1 = ak.to_backend(depth1, "cuda") + + assert ( + np.sum(array, axis=-1).dtype + == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype + ) + assert ( + np.prod(array, axis=-1).dtype + == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype + ) + assert sum(to_list(np.sum(array, axis=-1))) == sum( + to_list(ak.sum(depth1, axis=-1, highlevel=False)) + ) + assert prod(to_list(np.prod(array, axis=-1))) == prod( + to_list(ak.prod(depth1, axis=-1, highlevel=False)) + ) + del depth1 + + +def test_0115_generic_reducer_operation_sumprod_types_7(): + array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int64) + content2 = ak.contents.NumpyArray(array.reshape(-1)) + offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + + depth1 = ak.to_backend(depth1, "cuda") + + assert ( + np.sum(array, axis=-1).dtype + == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype + ) + assert ( + np.prod(array, axis=-1).dtype + == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype + ) + assert sum(to_list(np.sum(array, axis=-1))) == sum( + to_list(ak.sum(depth1, axis=-1, highlevel=False)) + ) + assert prod(to_list(np.prod(array, axis=-1))) == prod( + to_list(ak.prod(depth1, axis=-1, highlevel=False)) + ) + del depth1 + + +def test_0115_generic_reducer_operation_sumprod_types_8(): + array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint64) + content2 = ak.contents.NumpyArray(array.reshape(-1)) + offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + + depth1 = ak.to_backend(depth1, "cuda") + + assert ( + 
np.sum(array, axis=-1).dtype + == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype + ) + assert ( + np.prod(array, axis=-1).dtype + == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype + ) + assert sum(to_list(np.sum(array, axis=-1))) == sum( + to_list(ak.sum(depth1, axis=-1, highlevel=False)) + ) + assert prod(to_list(np.prod(array, axis=-1))) == prod( + to_list(ak.prod(depth1, axis=-1, highlevel=False)) + ) + del depth1 + + +def test_0115_generic_reducer_operation_sumprod_types_FIXME(): + array = np.array([[True, False, False], [True, False, False]]) + content2 = ak.contents.NumpyArray(array.reshape(-1)) + offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + depth1 = ak.to_backend(depth1, "cuda") + + assert ( + np.sum(array, axis=-1).dtype + == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype + ) + assert ( + np.prod(array, axis=-1).dtype + == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype + ) + del depth1 + + +def test_2020_reduce_axis_none_sum(): + array = ak.Array( + [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" + ) + cpt.assert_allclose(ak.sum(array, axis=None), 63.0) + assert ak.almost_equal( + ak.sum(array, axis=None, keepdims=True), + ak.to_regular(ak.Array([[63.0]], backend="cuda")), + ) + assert ak.almost_equal( + ak.sum(array, axis=None, keepdims=True, mask_identity=True), + ak.to_regular( + ak.Array([[63.0]], backend="cuda").mask[ak.Array([[True]], backend="cuda")] + ), + ) + assert ak.sum(array[2], axis=None, mask_identity=True) is None + del array + + +def test_2020_reduce_axis_none_prod(): + array = ak.Array( + [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" + ) + cpt.assert_allclose(ak.prod(array[1:], axis=None), 4838400.0) + assert ak.prod(array, axis=None) == 0 + assert ak.almost_equal( + ak.prod(array, axis=None, keepdims=True), + 
ak.to_regular(ak.Array([[0.0]], backend="cuda")), + ) + assert ak.almost_equal( + ak.prod(array[1:], axis=None, keepdims=True), + ak.to_regular(ak.Array([[4838400.0]], backend="cuda")), + ) + assert ak.almost_equal( + ak.prod(array[1:], axis=None, keepdims=True, mask_identity=True), + ak.to_regular( + ak.Array([[4838400.0]], backend="cuda").mask[ + ak.Array([[True]], backend="cuda") + ] + ), + ) + assert ak.prod(array[2], axis=None, mask_identity=True) is None + del array + + +def test_2020_reduce_axis_none_min(): + array = ak.Array( + [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" + ) + cpt.assert_allclose(ak.min(array, axis=None), 0.0) + assert ak.almost_equal( + ak.min(array, axis=None, keepdims=True, mask_identity=False), + ak.to_regular(ak.Array([[0.0]], backend="cuda")), + ) + assert ak.almost_equal( + ak.min(array, axis=None, keepdims=True, initial=-100.0, mask_identity=False), + ak.to_regular(ak.Array([[-100.0]], backend="cuda")), + ) + assert ak.almost_equal( + ak.min(array, axis=None, keepdims=True, mask_identity=True), + ak.to_regular( + ak.Array([[0.0]], backend="cuda").mask[ak.Array([[True]], backend="cuda")] + ), + ) + assert ak.almost_equal( + ak.min(array[-1:], axis=None, keepdims=True, mask_identity=True), + ak.to_regular( + ak.Array(ak.Array([[np.inf]], backend="cuda")).mask[ + ak.Array([[False]], backend="cuda") + ] + ), + ) + assert ak.min(array[2], axis=None, mask_identity=True) is None + del array + + +def test_2020_reduce_axis_none_max(): + array = ak.Array( + [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" + ) + cpt.assert_allclose(ak.max(array, axis=None), 10.0) + assert ak.almost_equal( + ak.max(array, axis=None, keepdims=True, mask_identity=False), + ak.to_regular(ak.Array([[10.0]], backend="cuda")), + ) + assert ak.almost_equal( + ak.max(array, axis=None, keepdims=True, initial=100.0, mask_identity=False), + ak.to_regular(ak.Array([[100.0]], backend="cuda")), + ) + assert 
ak.almost_equal( + ak.max(array, axis=None, keepdims=True, mask_identity=True), + ak.to_regular( + ak.Array([[10.0]], backend="cuda").mask[ak.Array([[True]], backend="cuda")] + ), + ) + assert ak.almost_equal( + ak.max(array[-1:], axis=None, keepdims=True, mask_identity=True), + ak.to_regular( + ak.Array(ak.Array([[np.inf]], backend="cuda")).mask[ + ak.Array([[False]], backend="cuda") + ] + ), + ) + assert ak.max(array[2], axis=None, mask_identity=True) is None + del array + + +def test_2020_reduce_axis_none_count(): + array = ak.Array( + [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" + ) + assert ak.count(array, axis=None) == 12 + assert ak.almost_equal( + ak.count(array, axis=None, keepdims=True, mask_identity=False), + ak.to_regular(ak.Array([[12]], backend="cuda")), + ) + assert ak.almost_equal( + ak.count(array, axis=None, keepdims=True, mask_identity=True), + ak.to_regular( + ak.Array([[12]], backend="cuda").mask[ak.Array([[True]], backend="cuda")] + ), + ) + assert ak.almost_equal( + ak.count(array[-1:], axis=None, keepdims=True, mask_identity=True), + ak.to_regular( + ak.Array([[0]], backend="cuda").mask[ak.Array([[False]], backend="cuda")] + ), + ) + assert ak.count(array[2], axis=None, mask_identity=True) is None + assert ak.count(array[2], axis=None, mask_identity=False) == 0 + del array + + +def test_2020_reduce_axis_none_count_nonzero(): + array = ak.Array( + [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" + ) + assert ak.count_nonzero(array, axis=None) == 11 + assert ak.almost_equal( + ak.count_nonzero(array, axis=None, keepdims=True, mask_identity=False), + ak.to_regular(ak.Array([[11]], backend="cuda")), + ) + assert ak.almost_equal( + ak.count_nonzero(array, axis=None, keepdims=True, mask_identity=True), + ak.to_regular( + ak.Array([[11]], backend="cuda").mask[ak.Array([[True]], backend="cuda")] + ), + ) + assert ak.almost_equal( + ak.count_nonzero(array[-1:], axis=None, keepdims=True, 
mask_identity=True), + ak.to_regular( + ak.Array([[0]], backend="cuda").mask[ak.Array([[False]], backend="cuda")] + ), + ) + assert ak.count_nonzero(array[2], axis=None, mask_identity=True) is None + assert ak.count_nonzero(array[2], axis=None, mask_identity=False) == 0 + del array + + +def test_2020_reduce_axis_none_std_no_mask_axis_none(): + array = ak.Array( + [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" + ) + out1 = ak.std(array[-1:], axis=None, keepdims=True, mask_identity=True) + out2 = ak.to_regular( + ak.Array([[0.0]], backend="cuda").mask[ak.Array([[False]], backend="cuda")] + ) + assert ak.almost_equal(out1, out2) + + out3 = ak.std(array[2], axis=None, mask_identity=True) + assert out3 is None + del array + del out1, out2, out3 + + +def test_2020_reduce_axis_none_std(): + array = ak.Array( + [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" + ) + cpt.assert_allclose(ak.std(array, axis=None), 3.139134700306227) + cpt.assert_allclose( + ak.std(array, axis=None, keepdims=True, mask_identity=False), + ak.to_regular([[3.139134700306227]]), + ) + cpt.assert_allclose( + ak.std(array, axis=None, keepdims=True, mask_identity=True), + ak.to_regular( + ak.Array([[3.139134700306227]], backend="cuda").mask[ + ak.Array([[True]], backend="cuda") + ] + ), + ) + assert np.isnan(ak.std(array[2], axis=None, mask_identity=False)) + del array diff --git a/tests-cuda/test_3140_cuda_jagged_and_masked_getitem.py b/tests-cuda/test_3140_cuda_jagged_and_masked_getitem.py index fff5417c03..064a6a5763 100644 --- a/tests-cuda/test_3140_cuda_jagged_and_masked_getitem.py +++ b/tests-cuda/test_3140_cuda_jagged_and_masked_getitem.py @@ -9,6 +9,13 @@ to_list = ak.operations.to_list +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + def test_0111_jagged_and_masked_getitem_bitmaskedarray2b(): array = ak.operations.from_iter( 
[[0.0, 1.1, 2.2], [3.3, 4.4], [5.5], [6.6, 7.7, 8.8, 9.9]], highlevel=False @@ -36,6 +43,8 @@ def test_0111_jagged_and_masked_getitem_bitmaskedarray2b(): ] assert maskedarray.to_typetracer()[cuda_array].form == maskedarray[cuda_array].form + del cuda_array + def test_0111_jagged_and_masked_getitem_bytemaskedarray2b(): array = ak.operations.from_iter( @@ -62,6 +71,7 @@ def test_0111_jagged_and_masked_getitem_bytemaskedarray2b(): [6.6, 9.9], ] assert maskedarray.to_typetracer()[cuda_array].form == maskedarray[cuda_array].form + del cuda_array def test_0111_jagged_and_masked_getitem_emptyarray(): @@ -113,6 +123,8 @@ def test_0111_jagged_and_masked_getitem_emptyarray(): with pytest.raises(IndexError): cuda_listoffsetarray[cuda_array5] + del cuda_listoffsetarray + def test_0111_jagged_and_masked_getitem_indexedarray(): array = ak.operations.from_iter( @@ -248,6 +260,9 @@ def test_0111_jagged_and_masked_getitem_indexedarray(): == cuda_indexedarray[cuda_array1].form ) + del cuda_indexedarray + del cuda_array1 + def test_0111_jagged_and_masked_getitem_indexedarray2(): array = ak.operations.from_iter( @@ -275,6 +290,8 @@ def test_0111_jagged_and_masked_getitem_indexedarray2(): cuda_indexedarray.to_typetracer()[cuda_array].form == cuda_indexedarray[cuda_array].form ) + del cuda_indexedarray + del cuda_array def test_0111_jagged_and_masked_getitem_indexedarray2b(): @@ -303,6 +320,8 @@ def test_0111_jagged_and_masked_getitem_indexedarray2b(): cuda_indexedarray.to_typetracer()[cuda_array].form == cuda_indexedarray[cuda_array].form ) + del cuda_indexedarray + del cuda_array def test_0111_jagged_and_masked_getitem_indexedarray3(): @@ -381,6 +400,13 @@ def test_0111_jagged_and_masked_getitem_indexedarray3(): with pytest.raises(IndexError): cuda_array[cuda_array6] + del cuda_array + del cuda_array2 + del cuda_array3 + del cuda_array4 + del cuda_array5 + del cuda_array6 + def test_0111_jagged_and_masked_getitem_jagged(): array = ak.highlevel.Array( @@ -402,6 +428,9 @@ def 
test_0111_jagged_and_masked_getitem_jagged(): ] assert cuda_array.to_typetracer()[cuda_array2].form == cuda_array[cuda_array2].form + del cuda_array + del cuda_array2 + def test_0111_jagged_and_masked_getitem_double_jagged(): array = ak.highlevel.Array( @@ -533,54 +562,53 @@ def test_0111_jagged_and_masked_getitem_array_boolean_to_int(): b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend) assert to_list(b) == [[1, 2], [], [1], [], [1, 2, 3]] - # a = ak.operations.from_iter( - # [[True, True, None], [], [True, None], [None], [True, True, True, None]], - # highlevel=False, - # ) - # cuda_a = ak.to_backend(a, "cuda", highlevel=False) - # # b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend) - # # error in _slicing line 553 - FIXME - # assert to_list(b) == [[0, 1, None], [], [0, None], [None], [0, 1, 2, None]] - # assert ( - # b.content.index.data[b.content.index.data >= 0].tolist() - # == np.arange(6).tolist() # kernels expect nonnegative entries to be arange - # ) - - # a = ak.operations.from_iter( - # [[None, True, True], [], [None, True], [None], [None, True, True, True]], - # highlevel=False, - # ) - # cuda_a = ak.to_backend(a, "cuda", highlevel=False) - # b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend) - # assert to_list(b) == [[None, 1, 2], [], [None, 1], [None], [None, 1, 2, 3]] - # assert ( - # b.content.index.data[b.content.index.data >= 0].tolist() - # == np.arange(6).tolist() # kernels expect nonnegative entries to be arange - # ) - - # a = ak.operations.from_iter( - # [[False, True, None], [], [False, None], [None], [False, True, True, None]], - # highlevel=False, - # ) - # cuda_a = ak.to_backend(a, "cuda", highlevel=False) - # b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend) - # assert to_list(b) == [[1, None], [], [None], [None], [1, 2, None]] - # assert ( - # b.content.index.data[b.content.index.data >= 0].tolist() - # == np.arange(3).tolist() # 
kernels expect nonnegative entries to be arange - # ) - - # a = ak.operations.from_iter( - # [[None, True, False], [], [None, False], [None], [None, True, True, False]], - # highlevel=False, - # ) - # cuda_a = ak.to_backend(a, "cuda", highlevel=False) - # b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend) - # assert to_list(b) == [[None, 1], [], [None], [None], [None, 1, 2]] - # assert ( - # b.content.index.data[b.content.index.data >= 0].tolist() - # == np.arange(3).tolist() # kernels expect nonnegative entries to be arange - # ) + a = ak.operations.from_iter( + [[True, True, None], [], [True, None], [None], [True, True, True, None]], + highlevel=False, + ) + cuda_a = ak.to_backend(a, "cuda", highlevel=False) + b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend) + assert to_list(b) == [[0, 1, None], [], [0, None], [None], [0, 1, 2, None]] + assert ( + b.content.index.data[b.content.index.data >= 0].tolist() + == np.arange(6).tolist() # kernels expect nonnegative entries to be arange + ) + + a = ak.operations.from_iter( + [[None, True, True], [], [None, True], [None], [None, True, True, True]], + highlevel=False, + ) + cuda_a = ak.to_backend(a, "cuda", highlevel=False) + b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend) + assert to_list(b) == [[None, 1, 2], [], [None, 1], [None], [None, 1, 2, 3]] + assert ( + b.content.index.data[b.content.index.data >= 0].tolist() + == np.arange(6).tolist() # kernels expect nonnegative entries to be arange + ) + + a = ak.operations.from_iter( + [[False, True, None], [], [False, None], [None], [False, True, True, None]], + highlevel=False, + ) + cuda_a = ak.to_backend(a, "cuda", highlevel=False) + b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend) + assert to_list(b) == [[1, None], [], [None], [None], [1, 2, None]] + assert ( + b.content.index.data[b.content.index.data >= 0].tolist() + == np.arange(3).tolist() # kernels expect 
nonnegative entries to be arange + ) + + a = ak.operations.from_iter( + [[None, True, False], [], [None, False], [None], [None, True, True, False]], + highlevel=False, + ) + cuda_a = ak.to_backend(a, "cuda", highlevel=False) + b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend) + assert to_list(b) == [[None, 1], [], [None], [None], [None, 1, 2]] + assert ( + b.content.index.data[b.content.index.data >= 0].tolist() + == np.arange(3).tolist() # kernels expect nonnegative entries to be arange + ) def test_0111_jagged_and_masked_getitem_array_slice(): diff --git a/tests-cuda/test_3140_cuda_slicing.py b/tests-cuda/test_3140_cuda_slicing.py index 047fc7977c..59e2cfcb67 100644 --- a/tests-cuda/test_3140_cuda_slicing.py +++ b/tests-cuda/test_3140_cuda_slicing.py @@ -1,12 +1,21 @@ from __future__ import annotations +import cupy as cp import numpy as np +import pytest import awkward as ak to_list = ak.operations.to_list +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + def test_0315_integerindex_null_more(): f = ak.highlevel.Array([[0, None, 2], None, [3, 4], []], backend="cuda").layout g1 = ak.highlevel.Array([[1, 2, None], None, [], [None]], backend="cuda").layout diff --git a/tests-cuda/test_3141_cuda_misc.py b/tests-cuda/test_3141_cuda_misc.py index eb5adeb78e..7582788d9c 100644 --- a/tests-cuda/test_3141_cuda_misc.py +++ b/tests-cuda/test_3141_cuda_misc.py @@ -1,6 +1,8 @@ from __future__ import annotations +import cupy as cp import numpy as np +import pytest import awkward as ak from awkward.types import ArrayType, NumpyType, RegularType @@ -8,6 +10,13 @@ to_list = ak.operations.to_list +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + def test_0150_ByteMaskedArray_flatten(): content = ak.operations.from_iter( [