From 1e1b8ea4ade00407d7a584a34887163bb2fa01be Mon Sep 17 00:00:00 2001
From: Manasvi Goyal <mg.manasvi@gmail.com>
Date: Thu, 16 May 2024 17:50:27 +0200
Subject: [PATCH 01/10] chore: trying atomics and tree reduction for CUDA
 reducer kernels

---
 .../reducers/awkward_reduce_sum_atomics.py    | 43 ++++++++++++
 .../awkward_reduce_sum_tree_reduction.py      | 65 +++++++++++++++++++
 2 files changed, 108 insertions(+)
 create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_sum_atomics.py
 create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py

diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_atomics.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_atomics.py
new file mode 100644
index 0000000000..d1e456cf4b
--- /dev/null
+++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_atomics.py
@@ -0,0 +1,43 @@
+import cupy as cp
+
+cuda_kernel = """
+extern "C" {
+    __global__ void reduce_sum_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) {
+       int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+       if (thread_id < outlength) {
+          toptr[thread_id] = 0;
+       }
+    }
+}
+extern "C" {
+    __global__ void reduce_sum_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) {
+       int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+       int stride = blockDim.x * gridDim.x;
+
+       for (int i = thread_id; i < lenparents; i += stride) {
+           atomicAdd(&toptr[parents[i]], fromptr[i]);
+       }
+    }
+}
+"""
+
+parents = cp.array([0, 1, 1, 2, 2, 3, 3, 3, 5], dtype=cp.int32)
+fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=cp.int32)
+lenparents = len(parents)
+outlength = int(cp.max(parents)) + 1
+toptr = cp.zeros(outlength, dtype=cp.int32)
+
+block_size = 256
+grid_size = (lenparents + block_size - 1) // block_size
+
+raw_module = cp.RawModule(code=cuda_kernel)
+
+reduce_sum_a = raw_module.get_function('reduce_sum_a')
+reduce_sum_b = raw_module.get_function('reduce_sum_b')
+
+reduce_sum_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
+reduce_sum_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
+
+toptr_host = toptr.get()
+print("atomic toptr:", toptr_host)
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py
new file mode 100644
index 0000000000..82b072cc63
--- /dev/null
+++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py
@@ -0,0 +1,65 @@
+import cupy as cp
+
+cuda_kernel = """
+extern "C" {
+    __global__ void reduce_sum_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) {
+       int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+       if (thread_id < outlength) {
+          toptr[thread_id] = 0;
+       }
+    }
+}
+    extern "C" {
+    __global__ void reduce_sum_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength) {
+        extern __shared__ int shared[];
+
+        int idx = threadIdx.x;
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (thread_id < lenparents) {
+            shared[idx] = fromptr[thread_id];
+            __syncthreads();
+
+            for (int stride = 1; stride < blockDim.x; stride *= 2) {
+                int index = idx - stride;
+                if (index >= 0 && parents[index] == parents[idx]) {
+                    shared[idx] += shared[index];
+                }
+                __syncthreads();
+            }
+
+            fromptr[thread_id] = shared[idx];
+
+            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+                int parent = parents[thread_id];
+                if (parent < lenparents) {
+                    toptr[parent] = shared[idx];
+                }
+            }
+        }
+    }
+}
+"""
+
+parents = cp.array([0, 1, 1, 2, 2, 3, 3, 3, 5], dtype=cp.int32)
+fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=cp.int32)
+lenparents = len(parents)
+outlength = int(cp.max(parents)) + 1
+toptr = cp.zeros(outlength, dtype=cp.int32)
+
+block_size = 256
+grid_size = (lenparents + block_size - 1) // block_size
+
+toptr = cp.zeros(outlength, dtype=cp.int32)
+
+raw_module = cp.RawModule(code=cuda_kernel)
+
+reduce_sum_a = raw_module.get_function('reduce_sum_a')
+reduce_sum_b = raw_module.get_function('reduce_sum_b')
+
+reduce_sum_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
+reduce_sum_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
+
+toptr_host = toptr.get()
+print("tree reduction toptr:", toptr_host)
\ No newline at end of file

From f79b2ae3a8440432f1fd6c4bbe553aa6f4ee3500 Mon Sep 17 00:00:00 2001
From: Manasvi Goyal <mg.manasvi@gmail.com>
Date: Thu, 16 May 2024 18:03:29 +0200
Subject: [PATCH 02/10] chore: add prod, min, max

---
 .../awkward_reduce_max_tree_reduction.py      | 65 +++++++++++++++++++
 .../awkward_reduce_min_tree_reduction.py      | 65 +++++++++++++++++++
 .../awkward_reduce_prod_tree_reduction.py     | 65 +++++++++++++++++++
 .../reducers/awkward_reduce_sum_atomics.py    | 12 ++--
 .../awkward_reduce_sum_tree_reduction.py      | 12 ++--
 5 files changed, 207 insertions(+), 12 deletions(-)
 create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py
 create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py
 create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py

diff --git a/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py
new file mode 100644
index 0000000000..a00047672a
--- /dev/null
+++ b/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py
@@ -0,0 +1,65 @@
+import cupy as cp
+
+cuda_kernel = """
+extern "C" {
+    __global__ void awkward_reduce_max_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) {
+       int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+       if (thread_id < outlength) {
+          toptr[thread_id] = -9223372036854775808;
+       }
+    }
+}
+    extern "C" {
+    __global__ void awkward_reduce_max_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength) {
+        extern __shared__ int shared[];
+
+        int idx = threadIdx.x;
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (thread_id < lenparents) {
+            shared[idx] = fromptr[thread_id];
+            __syncthreads();
+
+            for (int stride = 1; stride < blockDim.x; stride *= 2) {
+                int index = idx - stride;
+                if (index >= 0 && parents[index] == parents[idx]) {
+                    shared[tid] = max(shared[tid], shared[index]);
+                }
+                __syncthreads();
+            }
+
+            fromptr[thread_id] = shared[idx];
+
+            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+                int parent = parents[thread_id];
+                if (parent < lenparents) {
+                    toptr[parent] = shared[idx];
+                }
+            }
+        }
+    }
+}
+"""
+
+parents = cp.array([0, 1, 1, 2, 2, 3, 3, 3, 5], dtype=cp.int32)
+fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=cp.int32)
+lenparents = len(parents)
+outlength = int(cp.max(parents)) + 1
+toptr = cp.zeros(outlength, dtype=cp.int32)
+
+block_size = 256
+grid_size = (lenparents + block_size - 1) // block_size
+
+toptr = cp.zeros(outlength, dtype=cp.int32)
+
+raw_module = cp.RawModule(code=cuda_kernel)
+
+awkward_reduce_max_a = raw_module.get_function('awkward_reduce_max_a')
+awkward_reduce_max_b = raw_module.get_function('awkward_reduce_max_b')
+
+awkward_reduce_max_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
+awkward_reduce_max_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
+
+toptr_host = toptr.get()
+print("tree reduction toptr:", toptr_host)
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py
new file mode 100644
index 0000000000..4ee35375c5
--- /dev/null
+++ b/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py
@@ -0,0 +1,65 @@
+import cupy as cp
+
+cuda_kernel = """
+extern "C" {
+    __global__ void awkward_reduce_min_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) {
+       int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+       if (thread_id < outlength) {
+          toptr[thread_id] = 9223372036854775807;
+       }
+    }
+}
+    extern "C" {
+    __global__ void awkward_reduce_min_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength) {
+        extern __shared__ int shared[];
+
+        int idx = threadIdx.x;
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (thread_id < lenparents) {
+            shared[idx] = fromptr[thread_id];
+            __syncthreads();
+
+            for (int stride = 1; stride < blockDim.x; stride *= 2) {
+                int index = idx - stride;
+                if (index >= 0 && parents[index] == parents[idx]) {
+                    shared[tid] = min(shared[tid], shared[index]);
+                }
+                __syncthreads();
+            }
+
+            fromptr[thread_id] = shared[idx];
+
+            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+                int parent = parents[thread_id];
+                if (parent < lenparents) {
+                    toptr[parent] = shared[idx];
+                }
+            }
+        }
+    }
+}
+"""
+
+parents = cp.array([0, 1, 1, 2, 2, 3, 3, 3, 5], dtype=cp.int32)
+fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=cp.int32)
+lenparents = len(parents)
+outlength = int(cp.max(parents)) + 1
+toptr = cp.zeros(outlength, dtype=cp.int32)
+
+block_size = 256
+grid_size = (lenparents + block_size - 1) // block_size
+
+toptr = cp.zeros(outlength, dtype=cp.int32)
+
+raw_module = cp.RawModule(code=cuda_kernel)
+
+awkward_reduce_min_a = raw_module.get_function('awkward_reduce_min_a')
+awkward_reduce_min_b = raw_module.get_function('awkward_reduce_min_b')
+
+awkward_reduce_min_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
+awkward_reduce_min_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
+
+toptr_host = toptr.get()
+print("tree reduction toptr:", toptr_host)
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py
new file mode 100644
index 0000000000..ac17bbfdeb
--- /dev/null
+++ b/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py
@@ -0,0 +1,65 @@
+import cupy as cp
+
+cuda_kernel = """
+extern "C" {
+    __global__ void awkward_reduce_prod_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) {
+       int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+       if (thread_id < outlength) {
+          toptr[thread_id] = 1;
+       }
+    }
+}
+    extern "C" {
+    __global__ void awkward_reduce_prod_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength) {
+        extern __shared__ int shared[];
+
+        int idx = threadIdx.x;
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (thread_id < lenparents) {
+            shared[idx] = fromptr[thread_id];
+            __syncthreads();
+
+            for (int stride = 1; stride < blockDim.x; stride *= 2) {
+                int index = idx - stride;
+                if (index >= 0 && parents[index] == parents[idx]) {
+                    shared[idx] *= shared[index];
+                }
+                __syncthreads();
+            }
+
+            fromptr[thread_id] = shared[idx];
+
+            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+                int parent = parents[thread_id];
+                if (parent < lenparents) {
+                    toptr[parent] = shared[idx];
+                }
+            }
+        }
+    }
+}
+"""
+
+parents = cp.array([0, 1, 1, 2, 2, 3, 3, 3, 5], dtype=cp.int32)
+fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=cp.int32)
+lenparents = len(parents)
+outlength = int(cp.max(parents)) + 1
+toptr = cp.zeros(outlength, dtype=cp.int32)
+
+block_size = 256
+grid_size = (lenparents + block_size - 1) // block_size
+
+toptr = cp.zeros(outlength, dtype=cp.int32)
+
+raw_module = cp.RawModule(code=cuda_kernel)
+
+awkward_reduce_prod_a = raw_module.get_function('awkward_reduce_prod_a')
+awkward_reduce_prod_b = raw_module.get_function('awkward_reduce_prod_b')
+
+awkward_reduce_prod_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
+awkward_reduce_prod_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
+
+toptr_host = toptr.get()
+print("tree reduction toptr:", toptr_host)
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_atomics.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_atomics.py
index d1e456cf4b..dc2d6185ad 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_sum_atomics.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_atomics.py
@@ -2,7 +2,7 @@
 
 cuda_kernel = """
 extern "C" {
-    __global__ void reduce_sum_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) {
+    __global__ void awkward_reduce_sum_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) {
        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
        if (thread_id < outlength) {
@@ -11,7 +11,7 @@
     }
 }
 extern "C" {
-    __global__ void reduce_sum_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) {
+    __global__ void awkward_reduce_sum_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) {
        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
        int stride = blockDim.x * gridDim.x;
 
@@ -33,11 +33,11 @@
 
 raw_module = cp.RawModule(code=cuda_kernel)
 
-reduce_sum_a = raw_module.get_function('reduce_sum_a')
-reduce_sum_b = raw_module.get_function('reduce_sum_b')
+awkward_reduce_sum_a = raw_module.get_function('awkward_reduce_sum_a')
+awkward_reduce_sum_b = raw_module.get_function('awkward_reduce_sum_b')
 
-reduce_sum_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
-reduce_sum_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
+awkward_reduce_sum_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
+awkward_reduce_sum_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
 
 toptr_host = toptr.get()
 print("atomic toptr:", toptr_host)
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py
index 82b072cc63..8ec46ad4f8 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py
@@ -2,7 +2,7 @@
 
 cuda_kernel = """
 extern "C" {
-    __global__ void reduce_sum_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) {
+    __global__ void awkward_reduce_sum_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) {
        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
        if (thread_id < outlength) {
@@ -11,7 +11,7 @@
     }
 }
     extern "C" {
-    __global__ void reduce_sum_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength) {
+    __global__ void awkward_reduce_sum_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength) {
         extern __shared__ int shared[];
 
         int idx = threadIdx.x;
@@ -55,11 +55,11 @@
 
 raw_module = cp.RawModule(code=cuda_kernel)
 
-reduce_sum_a = raw_module.get_function('reduce_sum_a')
-reduce_sum_b = raw_module.get_function('reduce_sum_b')
+awkward_reduce_sum_a = raw_module.get_function('awkward_reduce_sum_a')
+awkward_reduce_sum_b = raw_module.get_function('awkward_reduce_sum_b')
 
-reduce_sum_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
-reduce_sum_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
+awkward_reduce_sum_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
+awkward_reduce_sum_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
 
 toptr_host = toptr.get()
 print("tree reduction toptr:", toptr_host)
\ No newline at end of file

From 49da4f26ae6c2b07e2b43ac82f92d5df6fa7a0cb Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Fri, 17 May 2024 09:33:53 +0200
Subject: [PATCH 03/10] fix: some fixes

---
 .../reducers/awkward_reduce_max_tree_reduction.py | 15 +++++++--------
 .../reducers/awkward_reduce_min_tree_reduction.py | 15 +++++++--------
 .../awkward_reduce_prod_tree_reduction.py         |  2 --
 .../reducers/awkward_reduce_sum_tree_reduction.py |  2 --
 4 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py
index a00047672a..eb20720a25 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py
@@ -2,16 +2,16 @@
 
 cuda_kernel = """
 extern "C" {
-    __global__ void awkward_reduce_max_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) {
+    __global__ void awkward_reduce_max_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int identity) {
        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
        if (thread_id < outlength) {
-          toptr[thread_id] = -9223372036854775808;
+          toptr[thread_id] = identity;
        }
     }
 }
     extern "C" {
-    __global__ void awkward_reduce_max_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength) {
+    __global__ void awkward_reduce_max_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int identity) {
         extern __shared__ int shared[];
 
         int idx = threadIdx.x;
@@ -24,7 +24,7 @@
             for (int stride = 1; stride < blockDim.x; stride *= 2) {
                 int index = idx - stride;
                 if (index >= 0 && parents[index] == parents[idx]) {
-                    shared[tid] = max(shared[tid], shared[index]);
+                    shared[idx] = max(shared[idx], shared[index]);
                 }
                 __syncthreads();
             }
@@ -46,20 +46,19 @@
 fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=cp.int32)
 lenparents = len(parents)
 outlength = int(cp.max(parents)) + 1
+identity = cp.int32(cp.iinfo(cp.int32).min)
 toptr = cp.zeros(outlength, dtype=cp.int32)
 
 block_size = 256
 grid_size = (lenparents + block_size - 1) // block_size
 
-toptr = cp.zeros(outlength, dtype=cp.int32)
-
 raw_module = cp.RawModule(code=cuda_kernel)
 
 awkward_reduce_max_a = raw_module.get_function('awkward_reduce_max_a')
 awkward_reduce_max_b = raw_module.get_function('awkward_reduce_max_b')
 
-awkward_reduce_max_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
-awkward_reduce_max_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
+awkward_reduce_max_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity))
+awkward_reduce_max_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity))
 
 toptr_host = toptr.get()
 print("tree reduction toptr:", toptr_host)
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py
index 4ee35375c5..07e26bc878 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py
@@ -2,16 +2,16 @@
 
 cuda_kernel = """
 extern "C" {
-    __global__ void awkward_reduce_min_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) {
+    __global__ void awkward_reduce_min_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int identity) {
        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
        if (thread_id < outlength) {
-          toptr[thread_id] = 9223372036854775807;
+          toptr[thread_id] = identity;
        }
     }
 }
     extern "C" {
-    __global__ void awkward_reduce_min_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength) {
+    __global__ void awkward_reduce_min_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int identity) {
         extern __shared__ int shared[];
 
         int idx = threadIdx.x;
@@ -24,7 +24,7 @@
             for (int stride = 1; stride < blockDim.x; stride *= 2) {
                 int index = idx - stride;
                 if (index >= 0 && parents[index] == parents[idx]) {
-                    shared[tid] = min(shared[tid], shared[index]);
+                    shared[idx] = min(shared[idx], shared[index]);
                 }
                 __syncthreads();
             }
@@ -46,20 +46,19 @@
 fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=cp.int32)
 lenparents = len(parents)
 outlength = int(cp.max(parents)) + 1
+identity = cp.int32(cp.iinfo(cp.int32).max)
 toptr = cp.zeros(outlength, dtype=cp.int32)
 
 block_size = 256
 grid_size = (lenparents + block_size - 1) // block_size
 
-toptr = cp.zeros(outlength, dtype=cp.int32)
-
 raw_module = cp.RawModule(code=cuda_kernel)
 
 awkward_reduce_min_a = raw_module.get_function('awkward_reduce_min_a')
 awkward_reduce_min_b = raw_module.get_function('awkward_reduce_min_b')
 
-awkward_reduce_min_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
-awkward_reduce_min_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
+awkward_reduce_min_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity))
+awkward_reduce_min_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity))
 
 toptr_host = toptr.get()
 print("tree reduction toptr:", toptr_host)
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py
index ac17bbfdeb..0b543d3423 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py
@@ -51,8 +51,6 @@
 block_size = 256
 grid_size = (lenparents + block_size - 1) // block_size
 
-toptr = cp.zeros(outlength, dtype=cp.int32)
-
 raw_module = cp.RawModule(code=cuda_kernel)
 
 awkward_reduce_prod_a = raw_module.get_function('awkward_reduce_prod_a')
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py
index 8ec46ad4f8..a8ae4f3335 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py
@@ -51,8 +51,6 @@
 block_size = 256
 grid_size = (lenparents + block_size - 1) // block_size
 
-toptr = cp.zeros(outlength, dtype=cp.int32)
-
 raw_module = cp.RawModule(code=cuda_kernel)
 
 awkward_reduce_sum_a = raw_module.get_function('awkward_reduce_sum_a')

From 80f91c969e51a2348e62238c536d70fb650c4638 Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Tue, 21 May 2024 14:05:23 +0200
Subject: [PATCH 04/10] fix: handle block boundaries

---
 .../awkward_reduce_max_tree_reduction.py      | 67 ++++++++++++-------
 .../awkward_reduce_min_tree_reduction.py      | 67 ++++++++++++-------
 .../awkward_reduce_prod_tree_reduction.py     | 62 +++++++++++------
 .../awkward_reduce_sum_tree_reduction.py      | 62 +++++++++++------
 4 files changed, 168 insertions(+), 90 deletions(-)

diff --git a/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py
index eb20720a25..0624c3ed8c 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py
@@ -2,7 +2,7 @@
 
 cuda_kernel = """
 extern "C" {
-    __global__ void awkward_reduce_max_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int identity) {
+    __global__ void awkward_reduce_max_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) {
        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
        if (thread_id < outlength) {
@@ -10,55 +10,74 @@
        }
     }
 }
-    extern "C" {
-    __global__ void awkward_reduce_max_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int identity) {
+    
+extern "C" {
+    __global__ void awkward_reduce_max_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int identity, int* partial) {
         extern __shared__ int shared[];
 
         int idx = threadIdx.x;
-        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+        int thread_id = blockIdx.x * blockDim.x + idx;
 
         if (thread_id < lenparents) {
             shared[idx] = fromptr[thread_id];
+        }
+        __syncthreads();
+
+        for (int stride = 1; stride < blockDim.x; stride *= 2) {
+            int val = identity;
+            if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+                val = shared[idx - stride];
+            }
+            __syncthreads();
+            shared[idx] = max(shared[idx], val);
             __syncthreads();
+        }
 
-            for (int stride = 1; stride < blockDim.x; stride *= 2) {
-                int index = idx - stride;
-                if (index >= 0 && parents[index] == parents[idx]) {
-                    shared[idx] = max(shared[idx], shared[index]);
-                }
-                __syncthreads();
+        if (thread_id < lenparents) {
+            int parent = parents[thread_id];
+            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+                partial[blockIdx.x * outlength + parent] = shared[idx];
             }
+        }
+    }
+}
 
-            fromptr[thread_id] = shared[idx];
+extern "C" {
+    __global__ void awkward_reduce_max_c(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int identity, int* partial) {
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
-            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-                int parent = parents[thread_id];
-                if (parent < lenparents) {
-                    toptr[parent] = shared[idx];
-                }
+        if (thread_id < outlength) {
+            int maximum = identity;
+            int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
+            for (int i = 0; i < blocks; ++i) {
+                maximum = max(maximum, partial[i * outlength + thread_id]);
             }
+            toptr[thread_id] = maximum;
         }
     }
 }
 """
 
-parents = cp.array([0, 1, 1, 2, 2, 3, 3, 3, 5], dtype=cp.int32)
-fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=cp.int32)
+parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32)
+fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32)
 lenparents = len(parents)
 outlength = int(cp.max(parents)) + 1
-identity = cp.int32(cp.iinfo(cp.int32).min)
-toptr = cp.zeros(outlength, dtype=cp.int32)
-
-block_size = 256
+toptr = cp.full(outlength, cp.iinfo(cp.int32).min, dtype=cp.int32)
+identity = cp.iinfo(cp.int32).min
+block_size = 2
+partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), cp.iinfo(cp.int32).min, dtype=cp.int32)
 grid_size = (lenparents + block_size - 1) // block_size
+shared_mem_size = block_size * cp.int32().nbytes
 
 raw_module = cp.RawModule(code=cuda_kernel)
 
 awkward_reduce_max_a = raw_module.get_function('awkward_reduce_max_a')
 awkward_reduce_max_b = raw_module.get_function('awkward_reduce_max_b')
+awkward_reduce_max_c = raw_module.get_function('awkward_reduce_max_c')
 
-awkward_reduce_max_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity))
-awkward_reduce_max_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity))
+awkward_reduce_max_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial))
+awkward_reduce_max_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial), shared_mem=shared_mem_size)
+awkward_reduce_max_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial))
 
 toptr_host = toptr.get()
 print("tree reduction toptr:", toptr_host)
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py
index 07e26bc878..5810ca97e5 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py
@@ -2,7 +2,7 @@
 
 cuda_kernel = """
 extern "C" {
-    __global__ void awkward_reduce_min_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int identity) {
+    __global__ void awkward_reduce_min_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) {
        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
        if (thread_id < outlength) {
@@ -10,55 +10,74 @@
        }
     }
 }
-    extern "C" {
-    __global__ void awkward_reduce_min_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int identity) {
+    
+extern "C" {
+    __global__ void awkward_reduce_min_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int identity, int* partial) {
         extern __shared__ int shared[];
 
         int idx = threadIdx.x;
-        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+        int thread_id = blockIdx.x * blockDim.x + idx;
 
         if (thread_id < lenparents) {
             shared[idx] = fromptr[thread_id];
+        }
+        __syncthreads();
+
+        for (int stride = 1; stride < blockDim.x; stride *= 2) {
+            int val = identity;
+            if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+                val = shared[idx - stride];
+            }
+            __syncthreads();
+            shared[idx] = min(shared[idx], val);
             __syncthreads();
+        }
 
-            for (int stride = 1; stride < blockDim.x; stride *= 2) {
-                int index = idx - stride;
-                if (index >= 0 && parents[index] == parents[idx]) {
-                    shared[idx] = min(shared[idx], shared[index]);
-                }
-                __syncthreads();
+        if (thread_id < lenparents) {
+            int parent = parents[thread_id];
+            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+                partial[blockIdx.x * outlength + parent] = shared[idx];
             }
+        }
+    }
+}
 
-            fromptr[thread_id] = shared[idx];
+extern "C" {
+    __global__ void awkward_reduce_min_c(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int identity, int* partial) {
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
-            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-                int parent = parents[thread_id];
-                if (parent < lenparents) {
-                    toptr[parent] = shared[idx];
-                }
+        if (thread_id < outlength) {
+            int minimum = identity;
+            int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
+            for (int i = 0; i < blocks; ++i) {
+                minimum = min(minimum, partial[i * outlength + thread_id]);
             }
+            toptr[thread_id] = minimum;
         }
     }
 }
 """
 
-parents = cp.array([0, 1, 1, 2, 2, 3, 3, 3, 5], dtype=cp.int32)
-fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=cp.int32)
+parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32)
+fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32)
 lenparents = len(parents)
 outlength = int(cp.max(parents)) + 1
-identity = cp.int32(cp.iinfo(cp.int32).max)
-toptr = cp.zeros(outlength, dtype=cp.int32)
-
-block_size = 256
+toptr = cp.full(outlength, cp.iinfo(cp.int32).max, dtype=cp.int32)
+identity = cp.iinfo(cp.int32).max
+block_size = 2
+partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), cp.iinfo(cp.int32).max, dtype=cp.int32)
 grid_size = (lenparents + block_size - 1) // block_size
+shared_mem_size = block_size * cp.int32().nbytes
 
 raw_module = cp.RawModule(code=cuda_kernel)
 
 awkward_reduce_min_a = raw_module.get_function('awkward_reduce_min_a')
 awkward_reduce_min_b = raw_module.get_function('awkward_reduce_min_b')
+awkward_reduce_min_c = raw_module.get_function('awkward_reduce_min_c')
 
-awkward_reduce_min_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity))
-awkward_reduce_min_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity))
+awkward_reduce_min_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial))
+awkward_reduce_min_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial), shared_mem=shared_mem_size)
+awkward_reduce_min_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial))
 
 toptr_host = toptr.get()
 print("tree reduction toptr:", toptr_host)
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py
index 0b543d3423..48f33cc5c8 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py
@@ -2,7 +2,7 @@
 
 cuda_kernel = """
 extern "C" {
-    __global__ void awkward_reduce_prod_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) {
+    __global__ void awkward_reduce_prod_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
        if (thread_id < outlength) {
@@ -10,54 +10,74 @@
        }
     }
 }
-    extern "C" {
-    __global__ void awkward_reduce_prod_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength) {
+    
+extern "C" {
+    __global__ void awkward_reduce_prod_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) {
         extern __shared__ int shared[];
 
         int idx = threadIdx.x;
-        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+        int thread_id = blockIdx.x * blockDim.x + idx;
 
         if (thread_id < lenparents) {
             shared[idx] = fromptr[thread_id];
+        }
+        __syncthreads();
+
+        for (int stride = 1; stride < blockDim.x; stride *= 2) {
+            int val = 1;
+            if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+                val = shared[idx - stride];
+            }
+            __syncthreads();
+            shared[idx] *= val;
             __syncthreads();
+        }
 
-            for (int stride = 1; stride < blockDim.x; stride *= 2) {
-                int index = idx - stride;
-                if (index >= 0 && parents[index] == parents[idx]) {
-                    shared[idx] *= shared[index];
-                }
-                __syncthreads();
+        if (thread_id < lenparents) {
+            int parent = parents[thread_id];
+            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+                partial[blockIdx.x * outlength + parent] = shared[idx];
             }
+        }
+    }
+}
 
-            fromptr[thread_id] = shared[idx];
+extern "C" {
+    __global__ void awkward_reduce_prod_c(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) {
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
-            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-                int parent = parents[thread_id];
-                if (parent < lenparents) {
-                    toptr[parent] = shared[idx];
-                }
+        if (thread_id < outlength) {
+            int prod = 1;
+            int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
+            for (int i = 0; i < blocks; ++i) {
+                prod *= partial[i * outlength + thread_id];
             }
+            toptr[thread_id] = prod;
         }
     }
 }
 """
 
-parents = cp.array([0, 1, 1, 2, 2, 3, 3, 3, 5], dtype=cp.int32)
-fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=cp.int32)
+parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32)
+fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32)
 lenparents = len(parents)
 outlength = int(cp.max(parents)) + 1
 toptr = cp.zeros(outlength, dtype=cp.int32)
 
-block_size = 256
+block_size = 2
+partial = cp.ones((outlength * ((lenparents + block_size - 1) // block_size)), dtype=cp.int32)
 grid_size = (lenparents + block_size - 1) // block_size
+shared_mem_size = block_size * cp.int32().nbytes
 
 raw_module = cp.RawModule(code=cuda_kernel)
 
 awkward_reduce_prod_a = raw_module.get_function('awkward_reduce_prod_a')
 awkward_reduce_prod_b = raw_module.get_function('awkward_reduce_prod_b')
+awkward_reduce_prod_c = raw_module.get_function('awkward_reduce_prod_c')
 
-awkward_reduce_prod_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
-awkward_reduce_prod_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
+awkward_reduce_prod_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
+awkward_reduce_prod_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
+awkward_reduce_prod_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
 
 toptr_host = toptr.get()
 print("tree reduction toptr:", toptr_host)
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py
index a8ae4f3335..c8757ebb64 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py
@@ -2,7 +2,7 @@
 
 cuda_kernel = """
 extern "C" {
-    __global__ void awkward_reduce_sum_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) {
+    __global__ void awkward_reduce_sum_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
        if (thread_id < outlength) {
@@ -10,54 +10,74 @@
        }
     }
 }
-    extern "C" {
-    __global__ void awkward_reduce_sum_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength) {
+    
+extern "C" {
+    __global__ void awkward_reduce_sum_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) {
         extern __shared__ int shared[];
 
         int idx = threadIdx.x;
-        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+        int thread_id = blockIdx.x * blockDim.x + idx;
 
         if (thread_id < lenparents) {
             shared[idx] = fromptr[thread_id];
+        }
+        __syncthreads();
+
+        for (int stride = 1; stride < blockDim.x; stride *= 2) {
+            int val = 0;
+            if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+                val = shared[idx - stride];
+            }
+            __syncthreads();
+            shared[idx] += val;
             __syncthreads();
+        }
 
-            for (int stride = 1; stride < blockDim.x; stride *= 2) {
-                int index = idx - stride;
-                if (index >= 0 && parents[index] == parents[idx]) {
-                    shared[idx] += shared[index];
-                }
-                __syncthreads();
+        if (thread_id < lenparents) {
+            int parent = parents[thread_id];
+            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+                partial[blockIdx.x * outlength + parent] = shared[idx];
             }
+        }
+    }
+}
 
-            fromptr[thread_id] = shared[idx];
+extern "C" {
+    __global__ void awkward_reduce_sum_c(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) {
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
-            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-                int parent = parents[thread_id];
-                if (parent < lenparents) {
-                    toptr[parent] = shared[idx];
-                }
+        if (thread_id < outlength) {
+            int sum = 0;
+            int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
+            for (int i = 0; i < blocks; ++i) {
+                sum += partial[i * outlength + thread_id];
             }
+            toptr[thread_id] = sum;
         }
     }
 }
 """
 
-parents = cp.array([0, 1, 1, 2, 2, 3, 3, 3, 5], dtype=cp.int32)
-fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=cp.int32)
+parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32)
+fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32)
 lenparents = len(parents)
 outlength = int(cp.max(parents)) + 1
 toptr = cp.zeros(outlength, dtype=cp.int32)
 
-block_size = 256
+block_size = 2
+partial = cp.zeros((outlength * ((lenparents + block_size - 1) // block_size)), dtype=cp.int32)
 grid_size = (lenparents + block_size - 1) // block_size
+shared_mem_size = block_size * cp.int32().nbytes
 
 raw_module = cp.RawModule(code=cuda_kernel)
 
 awkward_reduce_sum_a = raw_module.get_function('awkward_reduce_sum_a')
 awkward_reduce_sum_b = raw_module.get_function('awkward_reduce_sum_b')
+awkward_reduce_sum_c = raw_module.get_function('awkward_reduce_sum_c')
 
-awkward_reduce_sum_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
-awkward_reduce_sum_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength))
+awkward_reduce_sum_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
+awkward_reduce_sum_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
+awkward_reduce_sum_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
 
 toptr_host = toptr.get()
 print("tree reduction toptr:", toptr_host)
\ No newline at end of file

From f000fd83e5eafaf82629bcd629db34c8f868cda0 Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Tue, 21 May 2024 14:26:25 +0200
Subject: [PATCH 05/10] chore: add argmin and argmax

---
 .../awkward_reduce_argmax_tree_reduction.py   | 87 +++++++++++++++++++
 .../awkward_reduce_argmin_tree_reduction.py   | 87 +++++++++++++++++++
 2 files changed, 174 insertions(+)
 create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py
 create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py

diff --git a/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py
new file mode 100644
index 0000000000..41221ddf6b
--- /dev/null
+++ b/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py
@@ -0,0 +1,87 @@
+import cupy as cp
+
+cuda_kernel = """
+extern "C" {
+    __global__ void awkward_reduce_argmax_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
+       int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+       if (thread_id < outlength) {
+          toptr[thread_id] = -1;
+       }
+    }
+}
+    
+extern "C" {
+    __global__ void awkward_reduce_argmax_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) {
+        extern __shared__ int shared[];
+
+        int idx = threadIdx.x;
+        int thread_id = blockIdx.x * blockDim.x + idx;
+
+        if (thread_id < lenparents) {
+            shared[idx] = thread_id;
+        }
+        __syncthreads();
+
+        for (int stride = 1; stride < blockDim.x; stride *= 2) {
+            int index = -1;
+            if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+                index = shared[idx - stride];
+            }
+            __syncthreads();
+            if (index != -1 && (shared[idx] == -1 || fromptr[index] > fromptr[shared[idx]])) {
+                shared[idx] = index;
+            }
+            __syncthreads();
+        }
+
+        if (thread_id < lenparents) {
+            int parent = parents[thread_id];
+            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+                partial[blockIdx.x * outlength + parent] = shared[idx];
+            }
+        }
+    }
+}
+
+extern "C" {
+    __global__ void awkward_reduce_argmax_c(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) {
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (thread_id < outlength) {
+            int max_index = -1;
+            int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
+            for (int i = 0; i < blocks; ++i) {
+                int index = partial[i * outlength + thread_id];
+                if (index != -1 && (max_index == -1 || fromptr[index] > fromptr[max_index])) {
+                    max_index = index;
+                }
+            }
+            toptr[thread_id] = max_index;
+        }
+    }
+}
+"""
+
+parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32)
+fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32)
+lenparents = len(parents)
+outlength = int(cp.max(parents)) + 1
+toptr = cp.full(outlength, -1, dtype=cp.int32)
+block_size = 2
+partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), -1, dtype=cp.int32)
+grid_size = (lenparents + block_size - 1) // block_size
+shared_mem_size = block_size * cp.int32().nbytes
+
+raw_module = cp.RawModule(code=cuda_kernel)
+
+awkward_reduce_argmax_a = raw_module.get_function('awkward_reduce_argmax_a')
+awkward_reduce_argmax_b = raw_module.get_function('awkward_reduce_argmax_b')
+awkward_reduce_argmax_c = raw_module.get_function('awkward_reduce_argmax_c')
+
+awkward_reduce_argmax_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
+awkward_reduce_argmax_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
+awkward_reduce_argmax_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
+
+toptr_host = toptr.get()
+print("tree reduction toptr:", toptr_host)
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py
new file mode 100644
index 0000000000..8718339519
--- /dev/null
+++ b/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py
@@ -0,0 +1,87 @@
+import cupy as cp
+
+cuda_kernel = """
+extern "C" {
+    __global__ void awkward_reduce_argmin_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
+       int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+       if (thread_id < outlength) {
+          toptr[thread_id] = -1;
+       }
+    }
+}
+    
+extern "C" {
+    __global__ void awkward_reduce_argmin_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) {
+        extern __shared__ int shared[];
+
+        int idx = threadIdx.x;
+        int thread_id = blockIdx.x * blockDim.x + idx;
+
+        if (thread_id < lenparents) {
+            shared[idx] = thread_id;
+        }
+        __syncthreads();
+
+        for (int stride = 1; stride < blockDim.x; stride *= 2) {
+            int index = -1;
+            if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+                index = shared[idx - stride];
+            }
+            __syncthreads();
+            if (index != -1 && (shared[idx] == -1 || fromptr[index] < fromptr[shared[idx]])) {
+                shared[idx] = index;
+            }
+            __syncthreads();
+        }
+
+        if (thread_id < lenparents) {
+            int parent = parents[thread_id];
+            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+                partial[blockIdx.x * outlength + parent] = shared[idx];
+            }
+        }
+    }
+}
+
+extern "C" {
+    __global__ void awkward_reduce_argmin_c(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) {
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (thread_id < outlength) {
+            int min_index = -1;
+            int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
+            for (int i = 0; i < blocks; ++i) {
+                int index = partial[i * outlength + thread_id];
+                if (index != -1 && (min_index == -1 || fromptr[index] < fromptr[min_index])) {
+                    min_index = index;
+                }
+            }
+            toptr[thread_id] = min_index;
+        }
+    }
+}
+"""
+
+parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32)
+fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32)
+lenparents = len(parents)
+outlength = int(cp.max(parents)) + 1
+toptr = cp.full(outlength, -1, dtype=cp.int32)
+block_size = 2
+partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), -1, dtype=cp.int32)
+grid_size = (lenparents + block_size - 1) // block_size
+shared_mem_size = block_size * cp.int32().nbytes
+
+raw_module = cp.RawModule(code=cuda_kernel)
+
+awkward_reduce_argmin_a = raw_module.get_function('awkward_reduce_argmin_a')
+awkward_reduce_argmin_b = raw_module.get_function('awkward_reduce_argmin_b')
+awkward_reduce_argmin_c = raw_module.get_function('awkward_reduce_argmin_c')
+
+awkward_reduce_argmin_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
+awkward_reduce_argmin_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
+awkward_reduce_argmin_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
+
+toptr_host = toptr.get()
+print("tree reduction toptr:", toptr_host)
\ No newline at end of file

From c040a039c60c514ce209ba156dde5f19429f60c1 Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Tue, 21 May 2024 16:20:40 +0200
Subject: [PATCH 06/10] chore: add sum and max complex

---
 ...kward_reduce_max_complex_tree_reduction.py | 98 +++++++++++++++++++
 ...kward_reduce_sum_complex_tree_reduction.py | 92 +++++++++++++++++
 2 files changed, 190 insertions(+)
 create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_max_complex_tree_reduction.py
 create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_sum_complex_tree_reduction.py

diff --git a/studies/cuda-kernels/reducers/awkward_reduce_max_complex_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_max_complex_tree_reduction.py
new file mode 100644
index 0000000000..4d81b60a80
--- /dev/null
+++ b/studies/cuda-kernels/reducers/awkward_reduce_max_complex_tree_reduction.py
@@ -0,0 +1,98 @@
+import cupy as cp
+
+cuda_kernel = """
+extern "C" {
+    __global__ void awkward_reduce_max_complex_a(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) {
+       int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+       if (thread_id < outlength) {
+          toptr[thread_id * 2] = identity;
+          toptr[thread_id * 2 + 1] = 0;
+       }
+    }
+}
+    
+extern "C" {
+    __global__ void awkward_reduce_max_complex_b(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) {
+        extern __shared__ float shared[];
+
+        int idx = threadIdx.x;
+        int thread_id = blockIdx.x * blockDim.x + idx;
+
+        if (thread_id < lenparents) {
+            shared[idx * 2] = fromptr[thread_id * 2];
+            shared[idx * 2 + 1] = fromptr[thread_id * 2 + 1];
+        }
+        __syncthreads();
+
+        for (int stride = 1; stride < blockDim.x; stride *= 2) {
+            float real = identity;
+            float imag = 0;
+            if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+                real = shared[(idx - stride) * 2];
+                imag = shared[(idx - stride) * 2 + 1];
+            }
+            __syncthreads();
+            if (shared[idx * 2] < real || shared[idx * 2] == real && shared[idx * 2 + 1] < imag) {
+                shared[idx * 2] = real;
+                shared[idx * 2 + 1] = imag;
+            }
+            __syncthreads();
+        }
+
+        if (thread_id < lenparents) {
+            int parent = parents[thread_id];
+            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+                partial[(blockIdx.x * outlength + parent) * 2 ] = shared[idx * 2];
+                partial[(blockIdx.x * outlength + parent) * 2 + 1] = shared[idx * 2 + 1];
+            }
+        }
+    }
+}
+
+extern "C" {
+    __global__ void awkward_reduce_max_complex_c(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) {
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (thread_id < outlength) {
+            float real = identity;
+            float imag = 0;
+            int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
+            for (int i = 0; i < blocks; ++i) {
+            if (real < partial[2 * (i * outlength + thread_id)] ||
+                (partial[2 * (i * outlength + thread_id)] == real &&
+                imag < partial[2 * (i * outlength + thread_id) + 1])) {
+                    real = partial[(i * outlength + thread_id) * 2];
+                    imag = partial[(i * outlength + thread_id) * 2 + 1];
+                }
+            }
+            toptr[thread_id * 2] = real;
+            toptr[thread_id * 2 + 1] = imag;
+        }
+    }
+}
+"""
+
+parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32)
+fromptr = cp.array([1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 0], dtype=cp.float32)
+lenparents = len(parents)
+outlength = int(cp.max(parents)) + 1
+identity = cp.finfo(cp.float32).min
+toptr = cp.full(outlength * 2, identity, dtype=cp.float32)
+block_size = 2
+partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), identity, dtype=cp.float32)
+grid_size = (lenparents + block_size - 1) // block_size
+shared_mem_size = block_size * cp.float32().nbytes
+
+raw_module = cp.RawModule(code=cuda_kernel)
+
+awkward_reduce_max_complex_a = raw_module.get_function('awkward_reduce_max_complex_a')
+awkward_reduce_max_complex_b = raw_module.get_function('awkward_reduce_max_complex_b')
+awkward_reduce_max_complex_c = raw_module.get_function('awkward_reduce_max_complex_c')
+
+awkward_reduce_max_complex_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial))
+awkward_reduce_max_complex_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial), shared_mem=shared_mem_size)
+awkward_reduce_max_complex_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial))
+
+toptr_host = toptr[0::2] + 1j * toptr[1::2]
+print("tree reduction toptr:", toptr_host)
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_complex_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_complex_tree_reduction.py
new file mode 100644
index 0000000000..3f042880dc
--- /dev/null
+++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_complex_tree_reduction.py
@@ -0,0 +1,92 @@
+import cupy as cp
+
+cuda_kernel = """
+extern "C" {
+    __global__ void awkward_reduce_sum_complex_a(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int* partial) {
+       int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+       if (thread_id < outlength) {
+          toptr[thread_id * 2] = 0;
+          toptr[thread_id * 2 + 1] = 0;
+       }
+    }
+}
+    
+extern "C" {
+    __global__ void awkward_reduce_sum_complex_b(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int* partial) {
+        extern __shared__ float shared[];
+
+        int idx = threadIdx.x;
+        int thread_id = blockIdx.x * blockDim.x + idx;
+
+        if (thread_id < lenparents) {
+            shared[idx * 2] = fromptr[thread_id * 2];
+            shared[idx * 2 + 1] = fromptr[thread_id * 2 + 1];
+        }
+        __syncthreads();
+
+        for (int stride = 1; stride < blockDim.x; stride *= 2) {
+            float real = 0;
+            float imag = 0;
+            if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+                real = shared[(idx - stride) * 2];
+                imag = shared[(idx - stride) * 2 + 1];
+            }
+            __syncthreads();
+            shared[idx * 2] += real;
+            shared[idx * 2 + 1] += imag;
+            __syncthreads();
+        }
+
+        if (thread_id < lenparents) {
+            int parent = parents[thread_id];
+            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+                partial[(blockIdx.x * outlength + parent) * 2 ] = shared[idx * 2];
+                partial[(blockIdx.x * outlength + parent) * 2 + 1] = shared[idx * 2 + 1];
+            }
+        }
+    }
+}
+
+extern "C" {
+    __global__ void awkward_reduce_sum_complex_c(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int* partial) {
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (thread_id < outlength) {
+            float real = 0;
+            float imag = 0;
+            int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
+            for (int i = 0; i < blocks; ++i) {
+                real += partial[(i * outlength + thread_id) * 2];
+                imag += partial[(i * outlength + thread_id) * 2 + 1];
+            }
+            toptr[thread_id * 2] = real;
+            toptr[thread_id * 2 + 1] = imag;
+        }
+    }
+}
+"""
+
+parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32)
+fromptr = cp.array([1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 0], dtype=cp.float32)
+lenparents = len(parents)
+outlength = int(cp.max(parents)) + 1
+toptr = cp.zeros(outlength * 2, dtype=cp.float32)
+
+block_size = 2
+partial = cp.zeros((outlength * ((lenparents + block_size - 1) // block_size)), dtype=cp.float32)
+grid_size = (lenparents + block_size - 1) // block_size
+shared_mem_size = block_size * cp.int32().nbytes
+
+raw_module = cp.RawModule(code=cuda_kernel)
+
+awkward_reduce_sum_complex_a = raw_module.get_function('awkward_reduce_sum_complex_a')
+awkward_reduce_sum_complex_b = raw_module.get_function('awkward_reduce_sum_complex_b')
+awkward_reduce_sum_complex_c = raw_module.get_function('awkward_reduce_sum_complex_c')
+
+awkward_reduce_sum_complex_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
+awkward_reduce_sum_complex_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
+awkward_reduce_sum_complex_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
+
+toptr_host = toptr.get()
+print("tree reduction toptr:", toptr_host)
\ No newline at end of file

From a01979782a94c88746a357d5b7bfe77b70e3b70f Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Wed, 22 May 2024 10:27:45 +0200
Subject: [PATCH 07/10] chore: add sum and prod bool

---
 .../awkward_reduce_argmax_tree_reduction.py   |  7 +-
 .../awkward_reduce_argmin_tree_reduction.py   |  7 +-
 .../awkward_reduce_max_tree_reduction.py      | 11 +--
 ...kward_reduce_min_complex_tree_reduction.py | 98 +++++++++++++++++++
 .../awkward_reduce_min_tree_reduction.py      | 11 +--
 ...awkward_reduce_prod_bool_tree_reduction.py | 82 ++++++++++++++++
 .../awkward_reduce_prod_tree_reduction.py     |  7 +-
 .../awkward_reduce_sum_bool_tree_reduction.py | 82 ++++++++++++++++
 ...kward_reduce_sum_complex_tree_reduction.py |  3 +-
 .../awkward_reduce_sum_tree_reduction.py      |  7 +-
 10 files changed, 285 insertions(+), 30 deletions(-)
 create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_min_complex_tree_reduction.py
 create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py
 create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py

diff --git a/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py
index 41221ddf6b..9743c91c9b 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py
@@ -12,7 +12,7 @@
 }
     
 extern "C" {
-    __global__ void awkward_reduce_argmax_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) {
+    __global__ void awkward_reduce_argmax_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
         extern __shared__ int shared[];
 
         int idx = threadIdx.x;
@@ -45,7 +45,7 @@
 }
 
 extern "C" {
-    __global__ void awkward_reduce_argmax_c(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) {
+    __global__ void awkward_reduce_argmax_c(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
         int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
         if (thread_id < outlength) {
@@ -83,5 +83,4 @@
 awkward_reduce_argmax_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
 awkward_reduce_argmax_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
 
-toptr_host = toptr.get()
-print("tree reduction toptr:", toptr_host)
\ No newline at end of file
+assert cp.array_equal(toptr, cp.array([0, 2, 8, -1, -1, 9]))
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py
index 8718339519..0d8cd527a5 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py
@@ -12,7 +12,7 @@
 }
     
 extern "C" {
-    __global__ void awkward_reduce_argmin_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) {
+    __global__ void awkward_reduce_argmin_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
         extern __shared__ int shared[];
 
         int idx = threadIdx.x;
@@ -45,7 +45,7 @@
 }
 
 extern "C" {
-    __global__ void awkward_reduce_argmin_c(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) {
+    __global__ void awkward_reduce_argmin_c(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
         int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
         if (thread_id < outlength) {
@@ -83,5 +83,4 @@
 awkward_reduce_argmin_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
 awkward_reduce_argmin_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
 
-toptr_host = toptr.get()
-print("tree reduction toptr:", toptr_host)
\ No newline at end of file
+assert cp.array_equal(toptr, cp.array([0, 1, 3, -1, -1, 9]))
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py
index 0624c3ed8c..df60f4417e 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py
@@ -12,7 +12,7 @@
 }
     
 extern "C" {
-    __global__ void awkward_reduce_max_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int identity, int* partial) {
+    __global__ void awkward_reduce_max_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) {
         extern __shared__ int shared[];
 
         int idx = threadIdx.x;
@@ -43,7 +43,7 @@
 }
 
 extern "C" {
-    __global__ void awkward_reduce_max_c(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int identity, int* partial) {
+    __global__ void awkward_reduce_max_c(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) {
         int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
         if (thread_id < outlength) {
@@ -62,10 +62,10 @@
 fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32)
 lenparents = len(parents)
 outlength = int(cp.max(parents)) + 1
-toptr = cp.full(outlength, cp.iinfo(cp.int32).min, dtype=cp.int32)
 identity = cp.iinfo(cp.int32).min
+toptr = cp.full(outlength, identity, dtype=cp.int32)
 block_size = 2
-partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), cp.iinfo(cp.int32).min, dtype=cp.int32)
+partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), identity, dtype=cp.int32)
 grid_size = (lenparents + block_size - 1) // block_size
 shared_mem_size = block_size * cp.int32().nbytes
 
@@ -79,5 +79,4 @@
 awkward_reduce_max_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial), shared_mem=shared_mem_size)
 awkward_reduce_max_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial))
 
-toptr_host = toptr.get()
-print("tree reduction toptr:", toptr_host)
\ No newline at end of file
+assert cp.array_equal(toptr, cp.array([1, 3, 9, -2147483648, -2147483648, 10]))
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_min_complex_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_min_complex_tree_reduction.py
new file mode 100644
index 0000000000..f9cad9abab
--- /dev/null
+++ b/studies/cuda-kernels/reducers/awkward_reduce_min_complex_tree_reduction.py
@@ -0,0 +1,98 @@
+import cupy as cp
+
+cuda_kernel = """
+extern "C" {
+    __global__ void awkward_reduce_min_complex_a(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) {
+       int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+       if (thread_id < outlength) {
+          toptr[thread_id * 2] = identity;
+          toptr[thread_id * 2 + 1] = 0;
+       }
+    }
+}
+    
+extern "C" {
+    __global__ void awkward_reduce_min_complex_b(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) {
+        extern __shared__ float shared[];
+
+        int idx = threadIdx.x;
+        int thread_id = blockIdx.x * blockDim.x + idx;
+
+        if (thread_id < lenparents) {
+            shared[idx * 2] = fromptr[thread_id * 2];
+            shared[idx * 2 + 1] = fromptr[thread_id * 2 + 1];
+        }
+        __syncthreads();
+
+        for (int stride = 1; stride < blockDim.x; stride *= 2) {
+            float real = identity;
+            float imag = 0;
+            if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+                real = shared[(idx - stride) * 2];
+                imag = shared[(idx - stride) * 2 + 1];
+            }
+            __syncthreads();
+            if (shared[idx * 2] > real || shared[idx * 2] == real && shared[idx * 2 + 1] > imag) {
+                shared[idx * 2] = real;
+                shared[idx * 2 + 1] = imag;
+            }
+            __syncthreads();
+        }
+
+        if (thread_id < lenparents) {
+            int parent = parents[thread_id];
+            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+                partial[(blockIdx.x * outlength + parent) * 2 ] = shared[idx * 2];
+                partial[(blockIdx.x * outlength + parent) * 2 + 1] = shared[idx * 2 + 1];
+            }
+        }
+    }
+}
+
+extern "C" {
+    __global__ void awkward_reduce_min_complex_c(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) {
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (thread_id < outlength) {
+            float real = identity;
+            float imag = 0;
+            int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
+            for (int i = 0; i < blocks; ++i) {
+            if (real > partial[2 * (i * outlength + thread_id)] ||
+                (partial[2 * (i * outlength + thread_id)] == real &&
+                imag > partial[2 * (i * outlength + thread_id) + 1])) {
+                    real = partial[(i * outlength + thread_id) * 2];
+                    imag = partial[(i * outlength + thread_id) * 2 + 1];
+                }
+            }
+            toptr[thread_id * 2] = real;
+            toptr[thread_id * 2 + 1] = imag;
+        }
+    }
+}
+"""
+
+parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32)
+fromptr = cp.array([1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 0], dtype=cp.float32)
+lenparents = len(parents)
+outlength = int(cp.max(parents)) + 1
+identity = 111111
+toptr = cp.full(outlength * 2, identity, dtype=cp.float32)
+block_size = 2
+partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), identity, dtype=cp.float32)
+grid_size = (lenparents + block_size - 1) // block_size
+shared_mem_size = block_size * cp.float32().nbytes
+
+raw_module = cp.RawModule(code=cuda_kernel)
+
+awkward_reduce_min_complex_a = raw_module.get_function('awkward_reduce_min_complex_a')
+awkward_reduce_min_complex_b = raw_module.get_function('awkward_reduce_min_complex_b')
+awkward_reduce_min_complex_c = raw_module.get_function('awkward_reduce_min_complex_c')
+
+awkward_reduce_min_complex_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial))
+awkward_reduce_min_complex_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial), shared_mem=shared_mem_size)
+awkward_reduce_min_complex_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial))
+
+toptr_host = toptr[0::2] + 1j * toptr[1::2]
+print("tree reduction toptr:", toptr_host)
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py
index 5810ca97e5..e3a64f33d7 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py
@@ -12,7 +12,7 @@
 }
     
 extern "C" {
-    __global__ void awkward_reduce_min_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int identity, int* partial) {
+    __global__ void awkward_reduce_min_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) {
         extern __shared__ int shared[];
 
         int idx = threadIdx.x;
@@ -43,7 +43,7 @@
 }
 
 extern "C" {
-    __global__ void awkward_reduce_min_c(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int identity, int* partial) {
+    __global__ void awkward_reduce_min_c(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) {
         int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
         if (thread_id < outlength) {
@@ -62,10 +62,10 @@
 fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32)
 lenparents = len(parents)
 outlength = int(cp.max(parents)) + 1
-toptr = cp.full(outlength, cp.iinfo(cp.int32).max, dtype=cp.int32)
 identity = cp.iinfo(cp.int32).max
+toptr = cp.full(outlength, identity, dtype=cp.int32)
 block_size = 2
-partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), cp.iinfo(cp.int32).max, dtype=cp.int32)
+partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), identity, dtype=cp.int32)
 grid_size = (lenparents + block_size - 1) // block_size
 shared_mem_size = block_size * cp.int32().nbytes
 
@@ -79,5 +79,4 @@
 awkward_reduce_min_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial), shared_mem=shared_mem_size)
 awkward_reduce_min_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial))
 
-toptr_host = toptr.get()
-print("tree reduction toptr:", toptr_host)
\ No newline at end of file
+assert cp.array_equal(toptr, cp.array([1, 2, 4, 2147483647, 2147483647, 10]))
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py
new file mode 100644
index 0000000000..02a657ccb0
--- /dev/null
+++ b/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py
@@ -0,0 +1,82 @@
+import cupy as cp
+
+cuda_kernel = """
+extern "C" {
+    __global__ void awkward_reduce_prod_bool_a(bool* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
+       int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+       if (thread_id < outlength) {
+          toptr[thread_id] = true;
+       }
+    }
+}
+    
+extern "C" {
+    __global__ void awkward_reduce_prod_bool_b(bool* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
+        extern __shared__ int shared[];
+
+        int idx = threadIdx.x;
+        int thread_id = blockIdx.x * blockDim.x + idx;
+
+        if (thread_id < lenparents) {
+            shared[idx] = fromptr[thread_id];
+        }
+        __syncthreads();
+
+        for (int stride = 1; stride < blockDim.x; stride *= 2) {
+            int val = 1;
+            if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+                val = shared[idx - stride];
+            }
+            __syncthreads();
+            shared[idx] &= (val != 0);
+            __syncthreads();
+        }
+
+        if (thread_id < lenparents) {
+            int parent = parents[thread_id];
+            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+                partial[blockIdx.x * outlength + parent] = shared[idx];
+            }
+        }
+    }
+}
+
+extern "C" {
+    __global__ void awkward_reduce_prod_bool_c(bool* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (thread_id < outlength) {
+            int prod = 1;
+            int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
+            for (int i = 0; i < blocks; ++i) {
+                prod &= (partial[i * outlength + thread_id] != 0);
+            }
+            toptr[thread_id] = prod;
+        }
+    }
+}
+"""
+
+parents = cp.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3], dtype=cp.int32)
+fromptr = cp.array([1, 0, 0, 1, 1, 1, 1, 0, 0, 1], dtype=cp.int32)
+lenparents = len(parents)
+outlength = int(cp.max(parents)) + 1
+toptr = cp.ones(outlength, dtype=cp.bool_)
+
+block_size = 2
+partial = cp.ones((outlength * ((lenparents + block_size - 1) // block_size)), dtype=cp.int32)
+grid_size = (lenparents + block_size - 1) // block_size
+shared_mem_size = block_size * cp.int32().nbytes
+
+raw_module = cp.RawModule(code=cuda_kernel)
+
+awkward_reduce_prod_bool_a = raw_module.get_function('awkward_reduce_prod_bool_a')
+awkward_reduce_prod_bool_b = raw_module.get_function('awkward_reduce_prod_bool_b')
+awkward_reduce_prod_bool_c = raw_module.get_function('awkward_reduce_prod_bool_c')
+
+awkward_reduce_prod_bool_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
+awkward_reduce_prod_bool_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
+awkward_reduce_prod_bool_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
+
+assert cp.array_equal(toptr, cp.array([0, 1, 0, 1]))
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py
index 48f33cc5c8..d85c3863b6 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py
@@ -12,7 +12,7 @@
 }
     
 extern "C" {
-    __global__ void awkward_reduce_prod_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) {
+    __global__ void awkward_reduce_prod_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
         extern __shared__ int shared[];
 
         int idx = threadIdx.x;
@@ -43,7 +43,7 @@
 }
 
 extern "C" {
-    __global__ void awkward_reduce_prod_c(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) {
+    __global__ void awkward_reduce_prod_c(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
         int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
         if (thread_id < outlength) {
@@ -79,5 +79,4 @@
 awkward_reduce_prod_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
 awkward_reduce_prod_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
 
-toptr_host = toptr.get()
-print("tree reduction toptr:", toptr_host)
\ No newline at end of file
+assert cp.array_equal(toptr, cp.array([1, 6, 60480, 1, 1, 10]))
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py
new file mode 100644
index 0000000000..6b44646274
--- /dev/null
+++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py
@@ -0,0 +1,82 @@
+import cupy as cp
+
+cuda_kernel = """
+extern "C" {
+    __global__ void awkward_reduce_sum_bool_a(bool* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
+       int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+       if (thread_id < outlength) {
+          toptr[thread_id] = 0;
+       }
+    }
+}
+    
+extern "C" {
+    __global__ void awkward_reduce_sum_bool_b(bool* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
+        extern __shared__ int shared[];
+
+        int idx = threadIdx.x;
+        int thread_id = blockIdx.x * blockDim.x + idx;
+
+        if (thread_id < lenparents) {
+            shared[idx] = fromptr[thread_id];
+        }
+        __syncthreads();
+
+        for (int stride = 1; stride < blockDim.x; stride *= 2) {
+            int val = 0;
+            if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+                val = shared[idx - stride];
+            }
+            __syncthreads();
+            shared[idx] |= (val != 0);
+            __syncthreads();
+        }
+
+        if (thread_id < lenparents) {
+            int parent = parents[thread_id];
+            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+                partial[blockIdx.x * outlength + parent] = shared[idx];
+            }
+        }
+    }
+}
+
+extern "C" {
+    __global__ void awkward_reduce_sum_bool_c(bool* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (thread_id < outlength) {
+            int sum = 0;
+            int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
+            for (int i = 0; i < blocks; ++i) {
+                sum |= (partial[i * outlength + thread_id] != 0);
+            }
+            toptr[thread_id] = sum;
+        }
+    }
+}
+"""
+
+parents = cp.array([0, 0, 0, 2, 2, 3, 4, 4, 5], dtype=cp.int32)
+fromptr = cp.array([1, 0, 1, 0, 0, 1, 0, 1, 1], dtype=cp.int32)
+lenparents = len(parents)
+outlength = int(cp.max(parents)) + 1
+toptr = cp.zeros(outlength, dtype=cp.bool_)
+
+block_size = 2
+partial = cp.zeros((outlength * ((lenparents + block_size - 1) // block_size)), dtype=cp.int32)
+grid_size = (lenparents + block_size - 1) // block_size
+shared_mem_size = block_size * cp.int32().nbytes
+
+raw_module = cp.RawModule(code=cuda_kernel)
+
+awkward_reduce_sum_bool_a = raw_module.get_function('awkward_reduce_sum_bool_a')
+awkward_reduce_sum_bool_b = raw_module.get_function('awkward_reduce_sum_bool_b')
+awkward_reduce_sum_bool_c = raw_module.get_function('awkward_reduce_sum_bool_c')
+
+awkward_reduce_sum_bool_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
+awkward_reduce_sum_bool_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
+awkward_reduce_sum_bool_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
+
+assert cp.array_equal(toptr, cp.array([1, 0, 0, 1, 1, 1]))
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_complex_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_complex_tree_reduction.py
index 3f042880dc..06285338de 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_sum_complex_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_complex_tree_reduction.py
@@ -88,5 +88,4 @@
 awkward_reduce_sum_complex_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
 awkward_reduce_sum_complex_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
 
-toptr_host = toptr.get()
-print("tree reduction toptr:", toptr_host)
\ No newline at end of file
+assert cp.array_equal(toptr, cp.array([1, 0, 5, 3, 39, 33, 0, 0, 0, 0, 10, 0]))
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py
index c8757ebb64..f04ef5ce85 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py
@@ -12,7 +12,7 @@
 }
     
 extern "C" {
-    __global__ void awkward_reduce_sum_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) {
+    __global__ void awkward_reduce_sum_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
         extern __shared__ int shared[];
 
         int idx = threadIdx.x;
@@ -43,7 +43,7 @@
 }
 
 extern "C" {
-    __global__ void awkward_reduce_sum_c(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) {
+    __global__ void awkward_reduce_sum_c(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
         int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
         if (thread_id < outlength) {
@@ -79,5 +79,4 @@
 awkward_reduce_sum_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
 awkward_reduce_sum_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
 
-toptr_host = toptr.get()
-print("tree reduction toptr:", toptr_host)
\ No newline at end of file
+assert cp.array_equal(toptr, cp.array([1, 5, 39, 0, 0, 10]))
\ No newline at end of file

From 10c0a6cf8ea532b1c05e23d46325e469b6e38eb1 Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Wed, 22 May 2024 11:47:26 +0200
Subject: [PATCH 08/10] chore: add count kernels

---
 .../awkward_reduce_argmax_tree_reduction.py   | 27 +++---
 .../awkward_reduce_argmin_tree_reduction.py   | 27 +++---
 .../awkward_reduce_count_64_tree_reduction.py | 83 +++++++++++++++++++
 ...ward_reduce_countnonzero_tree_reduction.py | 83 +++++++++++++++++++
 .../awkward_reduce_max_tree_reduction.py      | 29 ++++---
 .../awkward_reduce_min_tree_reduction.py      | 28 ++++---
 ...awkward_reduce_prod_bool_tree_reduction.py | 25 +++---
 .../awkward_reduce_prod_tree_reduction.py     | 25 +++---
 .../awkward_reduce_sum_bool_tree_reduction.py | 27 +++---
 .../awkward_reduce_sum_tree_reduction.py      | 25 +++---
 10 files changed, 280 insertions(+), 99 deletions(-)
 create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_count_64_tree_reduction.py
 create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_countnonzero_tree_reduction.py

diff --git a/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py
index 9743c91c9b..ca1160f276 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py
@@ -68,19 +68,22 @@
 lenparents = len(parents)
 outlength = int(cp.max(parents)) + 1
 toptr = cp.full(outlength, -1, dtype=cp.int32)
-block_size = 2
-partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), -1, dtype=cp.int32)
-grid_size = (lenparents + block_size - 1) // block_size
-shared_mem_size = block_size * cp.int32().nbytes
 
-raw_module = cp.RawModule(code=cuda_kernel)
 
-awkward_reduce_argmax_a = raw_module.get_function('awkward_reduce_argmax_a')
-awkward_reduce_argmax_b = raw_module.get_function('awkward_reduce_argmax_b')
-awkward_reduce_argmax_c = raw_module.get_function('awkward_reduce_argmax_c')
+block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
+for i in range (len(block_size)):
+    partial = cp.full((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), -1, dtype=cp.int32)
+    grid_size = (lenparents + block_size[i] - 1) // block_size[i]
+    shared_mem_size = block_size[i] * cp.int32().nbytes
 
-awkward_reduce_argmax_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
-awkward_reduce_argmax_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
-awkward_reduce_argmax_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
+    raw_module = cp.RawModule(code=cuda_kernel)
 
-assert cp.array_equal(toptr, cp.array([0, 2, 8, -1, -1, 9]))
\ No newline at end of file
+    awkward_reduce_argmax_a = raw_module.get_function('awkward_reduce_argmax_a')
+    awkward_reduce_argmax_b = raw_module.get_function('awkward_reduce_argmax_b')
+    awkward_reduce_argmax_c = raw_module.get_function('awkward_reduce_argmax_c')
+
+    awkward_reduce_argmax_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
+    awkward_reduce_argmax_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
+    awkward_reduce_argmax_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
+    
+    assert cp.array_equal(toptr, cp.array([0, 2, 8, -1, -1, 9]))
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py
index 0d8cd527a5..4756809fc0 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py
@@ -68,19 +68,22 @@
 lenparents = len(parents)
 outlength = int(cp.max(parents)) + 1
 toptr = cp.full(outlength, -1, dtype=cp.int32)
-block_size = 2
-partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), -1, dtype=cp.int32)
-grid_size = (lenparents + block_size - 1) // block_size
-shared_mem_size = block_size * cp.int32().nbytes
 
-raw_module = cp.RawModule(code=cuda_kernel)
 
-awkward_reduce_argmin_a = raw_module.get_function('awkward_reduce_argmin_a')
-awkward_reduce_argmin_b = raw_module.get_function('awkward_reduce_argmin_b')
-awkward_reduce_argmin_c = raw_module.get_function('awkward_reduce_argmin_c')
+block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
+for i in range (len(block_size)):
+    partial = cp.full((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), -1, dtype=cp.int32)
+    grid_size = (lenparents + block_size[i] - 1) // block_size[i]
+    shared_mem_size = block_size[i] * cp.int32().nbytes
 
-awkward_reduce_argmin_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
-awkward_reduce_argmin_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
-awkward_reduce_argmin_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
+    raw_module = cp.RawModule(code=cuda_kernel)
 
-assert cp.array_equal(toptr, cp.array([0, 1, 3, -1, -1, 9]))
\ No newline at end of file
+    awkward_reduce_argmin_a = raw_module.get_function('awkward_reduce_argmin_a')
+    awkward_reduce_argmin_b = raw_module.get_function('awkward_reduce_argmin_b')
+    awkward_reduce_argmin_c = raw_module.get_function('awkward_reduce_argmin_c')
+
+    awkward_reduce_argmin_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
+    awkward_reduce_argmin_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
+    awkward_reduce_argmin_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
+
+    assert cp.array_equal(toptr, cp.array([0, 1, 3, -1, -1, 9]))
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_count_64_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_count_64_tree_reduction.py
new file mode 100644
index 0000000000..66fe580399
--- /dev/null
+++ b/studies/cuda-kernels/reducers/awkward_reduce_count_64_tree_reduction.py
@@ -0,0 +1,83 @@
+import cupy as cp
+
+cuda_kernel = """
+extern "C" {
+    __global__ void awkward_reduce_countnonzero_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
+       int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+       if (thread_id < outlength) {
+          toptr[thread_id] = 0;
+       }
+    }
+}
+    
+extern "C" {
+    __global__ void awkward_reduce_countnonzero_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
+        extern __shared__ int shared[];
+
+        int idx = threadIdx.x;
+        int thread_id = blockIdx.x * blockDim.x + idx;
+
+        if (thread_id < lenparents) {
+            shared[idx] = 1;
+        }
+        __syncthreads();
+
+        for (int stride = 1; stride < blockDim.x; stride *= 2) {
+            int val = 0;
+            if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+                val = shared[idx - stride];
+            }
+            __syncthreads();
+            shared[idx] += val;
+            __syncthreads();
+        }
+
+        if (thread_id < lenparents) {
+            int parent = parents[thread_id];
+            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+                partial[blockIdx.x * outlength + parent] = shared[idx];
+            }
+        }
+    }
+}
+
+extern "C" {
+    __global__ void awkward_reduce_countnonzero_c(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (thread_id < outlength) {
+            int countnonzero = 0;
+            int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
+            for (int i = 0; i < blocks; ++i) {
+                countnonzero += partial[i * outlength + thread_id];
+            }
+            toptr[thread_id] = countnonzero;
+        }
+    }
+}
+"""
+
+parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32)
+fromptr = cp.array([1, 2, 3, 0, 5, 6, 0, 8, 9, 0], dtype=cp.int32)
+lenparents = len(parents)
+outlength = int(cp.max(parents)) + 1
+toptr = cp.zeros(outlength, dtype=cp.int32)
+
+block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
+for i in range (len(block_size)):
+    partial = cp.zeros((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), dtype=cp.int32)
+    grid_size = (lenparents + block_size[i] - 1) // block_size[i]
+    shared_mem_size = block_size[i] * cp.int32().nbytes
+
+    raw_module = cp.RawModule(code=cuda_kernel)
+
+    awkward_reduce_countnonzero_a = raw_module.get_function('awkward_reduce_countnonzero_a')
+    awkward_reduce_countnonzero_b = raw_module.get_function('awkward_reduce_countnonzero_b')
+    awkward_reduce_countnonzero_c = raw_module.get_function('awkward_reduce_countnonzero_c')
+
+    awkward_reduce_countnonzero_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
+    awkward_reduce_countnonzero_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
+    awkward_reduce_countnonzero_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
+
+    assert cp.array_equal(toptr, cp.array([1, 2, 6, 0, 0, 1]))
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_countnonzero_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_countnonzero_tree_reduction.py
new file mode 100644
index 0000000000..0d7ca6d5e0
--- /dev/null
+++ b/studies/cuda-kernels/reducers/awkward_reduce_countnonzero_tree_reduction.py
@@ -0,0 +1,83 @@
+import cupy as cp
+
+cuda_kernel = """
+extern "C" {
+    __global__ void awkward_reduce_countnonzero_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
+       int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+       if (thread_id < outlength) {
+          toptr[thread_id] = 0;
+       }
+    }
+}
+    
+extern "C" {
+    __global__ void awkward_reduce_countnonzero_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
+        extern __shared__ int shared[];
+
+        int idx = threadIdx.x;
+        int thread_id = blockIdx.x * blockDim.x + idx;
+
+        if (thread_id < lenparents) {
+            shared[idx] = (fromptr[thread_id] != 0) ? 1 : 0;
+        }
+        __syncthreads();
+
+        for (int stride = 1; stride < blockDim.x; stride *= 2) {
+            int val = 0;
+            if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+                val = shared[idx - stride];
+            }
+            __syncthreads();
+            shared[idx] += val;
+            __syncthreads();
+        }
+
+        if (thread_id < lenparents) {
+            int parent = parents[thread_id];
+            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+                partial[blockIdx.x * outlength + parent] = shared[idx];
+            }
+        }
+    }
+}
+
+extern "C" {
+    __global__ void awkward_reduce_countnonzero_c(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (thread_id < outlength) {
+            int countnonzero = 0;
+            int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
+            for (int i = 0; i < blocks; ++i) {
+                countnonzero += partial[i * outlength + thread_id];
+            }
+            toptr[thread_id] = countnonzero;
+        }
+    }
+}
+"""
+
+parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32)
+fromptr = cp.array([1, 2, 3, 0, 5, 6, 0, 8, 9, 0], dtype=cp.int32)
+lenparents = len(parents)
+outlength = int(cp.max(parents)) + 1
+toptr = cp.zeros(outlength, dtype=cp.int32)
+
+block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
+for i in range (len(block_size)):
+    partial = cp.zeros((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), dtype=cp.int32)
+    grid_size = (lenparents + block_size[i] - 1) // block_size[i]
+    shared_mem_size = block_size[i] * cp.int32().nbytes
+
+    raw_module = cp.RawModule(code=cuda_kernel)
+
+    awkward_reduce_countnonzero_a = raw_module.get_function('awkward_reduce_countnonzero_a')
+    awkward_reduce_countnonzero_b = raw_module.get_function('awkward_reduce_countnonzero_b')
+    awkward_reduce_countnonzero_c = raw_module.get_function('awkward_reduce_countnonzero_c')
+
+    awkward_reduce_countnonzero_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
+    awkward_reduce_countnonzero_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
+    awkward_reduce_countnonzero_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
+
+    assert cp.array_equal(toptr, cp.array([1, 2, 4, 0, 0, 0]))
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py
index df60f4417e..dfff51a25f 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py
@@ -59,24 +59,27 @@
 """
 
 parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32)
-fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32)
+fromptr = cp.array([1, -2, -3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32)
 lenparents = len(parents)
 outlength = int(cp.max(parents)) + 1
 identity = cp.iinfo(cp.int32).min
 toptr = cp.full(outlength, identity, dtype=cp.int32)
-block_size = 2
-partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), identity, dtype=cp.int32)
-grid_size = (lenparents + block_size - 1) // block_size
-shared_mem_size = block_size * cp.int32().nbytes
 
-raw_module = cp.RawModule(code=cuda_kernel)
 
-awkward_reduce_max_a = raw_module.get_function('awkward_reduce_max_a')
-awkward_reduce_max_b = raw_module.get_function('awkward_reduce_max_b')
-awkward_reduce_max_c = raw_module.get_function('awkward_reduce_max_c')
+block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
+for i in range (len(block_size)):
+    partial = cp.full((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), identity, dtype=cp.int32)
+    grid_size = (lenparents + block_size[i] - 1) // block_size[i]
+    shared_mem_size = block_size[i] * cp.int32().nbytes
 
-awkward_reduce_max_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial))
-awkward_reduce_max_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial), shared_mem=shared_mem_size)
-awkward_reduce_max_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial))
+    raw_module = cp.RawModule(code=cuda_kernel)
 
-assert cp.array_equal(toptr, cp.array([1, 3, 9, -2147483648, -2147483648, 10]))
\ No newline at end of file
+    awkward_reduce_max_a = raw_module.get_function('awkward_reduce_max_a')
+    awkward_reduce_max_b = raw_module.get_function('awkward_reduce_max_b')
+    awkward_reduce_max_c = raw_module.get_function('awkward_reduce_max_c')
+
+    awkward_reduce_max_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, identity, partial))
+    awkward_reduce_max_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, identity, partial), shared_mem=shared_mem_size)
+    awkward_reduce_max_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, identity, partial))
+
+    assert cp.array_equal(toptr, cp.array([1, -2, 9, -2147483648, -2147483648, 10]))
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py
index e3a64f33d7..8069a3007d 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py
@@ -59,24 +59,26 @@
 """
 
 parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32)
-fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32)
+fromptr = cp.array([1, -2, -3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32)
 lenparents = len(parents)
 outlength = int(cp.max(parents)) + 1
 identity = cp.iinfo(cp.int32).max
 toptr = cp.full(outlength, identity, dtype=cp.int32)
-block_size = 2
-partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), identity, dtype=cp.int32)
-grid_size = (lenparents + block_size - 1) // block_size
-shared_mem_size = block_size * cp.int32().nbytes
 
-raw_module = cp.RawModule(code=cuda_kernel)
+block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
+for i in range (len(block_size)):
+    partial = cp.full((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), identity, dtype=cp.int32)
+    grid_size = (lenparents + block_size[i] - 1) // block_size[i]
+    shared_mem_size = block_size[i] * cp.int32().nbytes
 
-awkward_reduce_min_a = raw_module.get_function('awkward_reduce_min_a')
-awkward_reduce_min_b = raw_module.get_function('awkward_reduce_min_b')
-awkward_reduce_min_c = raw_module.get_function('awkward_reduce_min_c')
+    raw_module = cp.RawModule(code=cuda_kernel)
 
-awkward_reduce_min_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial))
-awkward_reduce_min_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial), shared_mem=shared_mem_size)
-awkward_reduce_min_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial))
+    awkward_reduce_min_a = raw_module.get_function('awkward_reduce_min_a')
+    awkward_reduce_min_b = raw_module.get_function('awkward_reduce_min_b')
+    awkward_reduce_min_c = raw_module.get_function('awkward_reduce_min_c')
 
-assert cp.array_equal(toptr, cp.array([1, 2, 4, 2147483647, 2147483647, 10]))
\ No newline at end of file
+    awkward_reduce_min_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, identity, partial))
+    awkward_reduce_min_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, identity, partial), shared_mem=shared_mem_size)
+    awkward_reduce_min_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, identity, partial))
+
+    assert cp.array_equal(toptr, cp.array([1, -3, 4, 2147483647, 2147483647, 10]))
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py
index 02a657ccb0..2acbbac92e 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py
@@ -64,19 +64,20 @@
 outlength = int(cp.max(parents)) + 1
 toptr = cp.ones(outlength, dtype=cp.bool_)
 
-block_size = 2
-partial = cp.ones((outlength * ((lenparents + block_size - 1) // block_size)), dtype=cp.int32)
-grid_size = (lenparents + block_size - 1) // block_size
-shared_mem_size = block_size * cp.int32().nbytes
+block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
+for i in range (len(block_size)):
+    partial = cp.ones((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), dtype=cp.int32)
+    grid_size = (lenparents + block_size[i] - 1) // block_size[i]
+    shared_mem_size = block_size[i] * cp.int32().nbytes
 
-raw_module = cp.RawModule(code=cuda_kernel)
+    raw_module = cp.RawModule(code=cuda_kernel)
 
-awkward_reduce_prod_bool_a = raw_module.get_function('awkward_reduce_prod_bool_a')
-awkward_reduce_prod_bool_b = raw_module.get_function('awkward_reduce_prod_bool_b')
-awkward_reduce_prod_bool_c = raw_module.get_function('awkward_reduce_prod_bool_c')
+    awkward_reduce_prod_bool_a = raw_module.get_function('awkward_reduce_prod_bool_a')
+    awkward_reduce_prod_bool_b = raw_module.get_function('awkward_reduce_prod_bool_b')
+    awkward_reduce_prod_bool_c = raw_module.get_function('awkward_reduce_prod_bool_c')
 
-awkward_reduce_prod_bool_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
-awkward_reduce_prod_bool_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
-awkward_reduce_prod_bool_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
+    awkward_reduce_prod_bool_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
+    awkward_reduce_prod_bool_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
+    awkward_reduce_prod_bool_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
 
-assert cp.array_equal(toptr, cp.array([0, 1, 0, 1]))
\ No newline at end of file
+    assert cp.array_equal(toptr, cp.array([0, 1, 0, 1]))
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py
index d85c3863b6..910ba712b8 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py
@@ -64,19 +64,20 @@
 outlength = int(cp.max(parents)) + 1
 toptr = cp.zeros(outlength, dtype=cp.int32)
 
-block_size = 2
-partial = cp.ones((outlength * ((lenparents + block_size - 1) // block_size)), dtype=cp.int32)
-grid_size = (lenparents + block_size - 1) // block_size
-shared_mem_size = block_size * cp.int32().nbytes
+block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
+for i in range (len(block_size)):
+    partial = cp.ones((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), dtype=cp.int32)
+    grid_size = (lenparents + block_size[i] - 1) // block_size[i]
+    shared_mem_size = block_size[i] * cp.int32().nbytes
 
-raw_module = cp.RawModule(code=cuda_kernel)
+    raw_module = cp.RawModule(code=cuda_kernel)
 
-awkward_reduce_prod_a = raw_module.get_function('awkward_reduce_prod_a')
-awkward_reduce_prod_b = raw_module.get_function('awkward_reduce_prod_b')
-awkward_reduce_prod_c = raw_module.get_function('awkward_reduce_prod_c')
+    awkward_reduce_prod_a = raw_module.get_function('awkward_reduce_prod_a')
+    awkward_reduce_prod_b = raw_module.get_function('awkward_reduce_prod_b')
+    awkward_reduce_prod_c = raw_module.get_function('awkward_reduce_prod_c')
 
-awkward_reduce_prod_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
-awkward_reduce_prod_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
-awkward_reduce_prod_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
+    awkward_reduce_prod_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
+    awkward_reduce_prod_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
+    awkward_reduce_prod_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
 
-assert cp.array_equal(toptr, cp.array([1, 6, 60480, 1, 1, 10]))
\ No newline at end of file
+    assert cp.array_equal(toptr, cp.array([1, 6, 60480, 1, 1, 10]))
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py
index 6b44646274..8a512bd9bf 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py
@@ -6,7 +6,7 @@
        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
        if (thread_id < outlength) {
-          toptr[thread_id] = 0;
+          toptr[thread_id] = false;
        }
     }
 }
@@ -64,19 +64,20 @@
 outlength = int(cp.max(parents)) + 1
 toptr = cp.zeros(outlength, dtype=cp.bool_)
 
-block_size = 2
-partial = cp.zeros((outlength * ((lenparents + block_size - 1) // block_size)), dtype=cp.int32)
-grid_size = (lenparents + block_size - 1) // block_size
-shared_mem_size = block_size * cp.int32().nbytes
+block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
+for i in range (len(block_size)):
+    partial = cp.zeros((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), dtype=cp.int32)
+    grid_size = (lenparents + block_size[i] - 1) // block_size[i]
+    shared_mem_size = block_size[i] * cp.int32().nbytes
 
-raw_module = cp.RawModule(code=cuda_kernel)
+    raw_module = cp.RawModule(code=cuda_kernel)
 
-awkward_reduce_sum_bool_a = raw_module.get_function('awkward_reduce_sum_bool_a')
-awkward_reduce_sum_bool_b = raw_module.get_function('awkward_reduce_sum_bool_b')
-awkward_reduce_sum_bool_c = raw_module.get_function('awkward_reduce_sum_bool_c')
+    awkward_reduce_sum_bool_a = raw_module.get_function('awkward_reduce_sum_bool_a')
+    awkward_reduce_sum_bool_b = raw_module.get_function('awkward_reduce_sum_bool_b')
+    awkward_reduce_sum_bool_c = raw_module.get_function('awkward_reduce_sum_bool_c')
 
-awkward_reduce_sum_bool_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
-awkward_reduce_sum_bool_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
-awkward_reduce_sum_bool_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
+    awkward_reduce_sum_bool_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
+    awkward_reduce_sum_bool_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
+    awkward_reduce_sum_bool_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
 
-assert cp.array_equal(toptr, cp.array([1, 0, 0, 1, 1, 1]))
\ No newline at end of file
+    assert cp.array_equal(toptr, cp.array([1, 0, 0, 1, 1, 1]))
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py
index f04ef5ce85..11428c5475 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py
@@ -64,19 +64,20 @@
 outlength = int(cp.max(parents)) + 1
 toptr = cp.zeros(outlength, dtype=cp.int32)
 
-block_size = 2
-partial = cp.zeros((outlength * ((lenparents + block_size - 1) // block_size)), dtype=cp.int32)
-grid_size = (lenparents + block_size - 1) // block_size
-shared_mem_size = block_size * cp.int32().nbytes
+block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
+for i in range (len(block_size)):
+    partial = cp.zeros((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), dtype=cp.int32)
+    grid_size = (lenparents + block_size[i] - 1) // block_size[i]
+    shared_mem_size = block_size[i] * cp.int32().nbytes
 
-raw_module = cp.RawModule(code=cuda_kernel)
+    raw_module = cp.RawModule(code=cuda_kernel)
 
-awkward_reduce_sum_a = raw_module.get_function('awkward_reduce_sum_a')
-awkward_reduce_sum_b = raw_module.get_function('awkward_reduce_sum_b')
-awkward_reduce_sum_c = raw_module.get_function('awkward_reduce_sum_c')
+    awkward_reduce_sum_a = raw_module.get_function('awkward_reduce_sum_a')
+    awkward_reduce_sum_b = raw_module.get_function('awkward_reduce_sum_b')
+    awkward_reduce_sum_c = raw_module.get_function('awkward_reduce_sum_c')
 
-awkward_reduce_sum_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
-awkward_reduce_sum_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
-awkward_reduce_sum_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
+    awkward_reduce_sum_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
+    awkward_reduce_sum_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
+    awkward_reduce_sum_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
 
-assert cp.array_equal(toptr, cp.array([1, 5, 39, 0, 0, 10]))
\ No newline at end of file
+    assert cp.array_equal(toptr, cp.array([1, 5, 39, 0, 0, 10]))
\ No newline at end of file

From fbb11955f8241d36541b38cc58616220c6d9543a Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Wed, 22 May 2024 12:16:26 +0200
Subject: [PATCH 09/10] chore: add sum int32 and int64 bool kernels

---
 .../awkward_reduce_argmax_tree_reduction.py   |  2 +-
 ...awkward_reduce_prod_bool_tree_reduction.py |  2 +-
 .../awkward_reduce_sum_bool_tree_reduction.py |  2 +-
 ...reduce_sum_int32_bool_64_tree_reduction.py | 83 +++++++++++++++++++
 ...reduce_sum_int64_bool_64_tree_reduction.py | 83 +++++++++++++++++++
 5 files changed, 169 insertions(+), 3 deletions(-)
 create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_sum_int32_bool_64_tree_reduction.py
 create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_sum_int64_bool_64_tree_reduction.py

diff --git a/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py
index ca1160f276..b0c4d8cfaa 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py
@@ -85,5 +85,5 @@
     awkward_reduce_argmax_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
     awkward_reduce_argmax_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
     awkward_reduce_argmax_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
-    
+
     assert cp.array_equal(toptr, cp.array([0, 2, 8, -1, -1, 9]))
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py
index 2acbbac92e..156625fa68 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py
@@ -47,7 +47,7 @@
         int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
         if (thread_id < outlength) {
-            int prod = 1;
+            bool prod = 1;
             int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
             for (int i = 0; i < blocks; ++i) {
                 prod &= (partial[i * outlength + thread_id] != 0);
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py
index 8a512bd9bf..971331a3e6 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py
@@ -47,7 +47,7 @@
         int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
         if (thread_id < outlength) {
-            int sum = 0;
+            bool sum = 0;
             int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
             for (int i = 0; i < blocks; ++i) {
                 sum |= (partial[i * outlength + thread_id] != 0);
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_int32_bool_64_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_int32_bool_64_tree_reduction.py
new file mode 100644
index 0000000000..28a0cb3a34
--- /dev/null
+++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_int32_bool_64_tree_reduction.py
@@ -0,0 +1,83 @@
+import cupy as cp
+
+cuda_kernel = """
+extern "C" {
+    __global__ void awkward_reduce_countnonzero_a(int* toptr, bool* fromptr, int* parents, int lenparents, int outlength, int* partial) {
+       int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+       if (thread_id < outlength) {
+          toptr[thread_id] = 0;
+       }
+    }
+}
+    
+extern "C" {
+    __global__ void awkward_reduce_countnonzero_b(int* toptr, bool* fromptr, int* parents, int lenparents, int outlength, int* partial) {
+        extern __shared__ int shared[];
+
+        int idx = threadIdx.x;
+        int thread_id = blockIdx.x * blockDim.x + idx;
+
+        if (thread_id < lenparents) {
+            shared[idx] = (fromptr[thread_id] != 0) ? 1 : 0;
+        }
+        __syncthreads();
+
+        for (int stride = 1; stride < blockDim.x; stride *= 2) {
+            int val = 0;
+            if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+                val = shared[idx - stride];
+            }
+            __syncthreads();
+            shared[idx] += val;
+            __syncthreads();
+        }
+
+        if (thread_id < lenparents) {
+            int parent = parents[thread_id];
+            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+                partial[blockIdx.x * outlength + parent] = shared[idx];
+            }
+        }
+    }
+}
+
+extern "C" {
+    __global__ void awkward_reduce_countnonzero_c(int* toptr, bool* fromptr, int* parents, int lenparents, int outlength, int* partial) {
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (thread_id < outlength) {
+            int countnonzero = 0;
+            int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
+            for (int i = 0; i < blocks; ++i) {
+                countnonzero += partial[i * outlength + thread_id];
+            }
+            toptr[thread_id] = countnonzero;
+        }
+    }
+}
+"""
+
+parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32)
+fromptr = cp.array([1, 1, 1, 0, 1, 1, 0, 1, 1, 0], dtype=cp.bool_)
+lenparents = len(parents)
+outlength = int(cp.max(parents)) + 1
+toptr = cp.zeros(outlength, dtype=cp.int32)
+
+block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
+for i in range (len(block_size)):
+    partial = cp.zeros((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), dtype=cp.int32)
+    grid_size = (lenparents + block_size[i] - 1) // block_size[i]
+    shared_mem_size = block_size[i] * cp.int32().nbytes
+
+    raw_module = cp.RawModule(code=cuda_kernel)
+
+    awkward_reduce_countnonzero_a = raw_module.get_function('awkward_reduce_countnonzero_a')
+    awkward_reduce_countnonzero_b = raw_module.get_function('awkward_reduce_countnonzero_b')
+    awkward_reduce_countnonzero_c = raw_module.get_function('awkward_reduce_countnonzero_c')
+
+    awkward_reduce_countnonzero_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
+    awkward_reduce_countnonzero_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
+    awkward_reduce_countnonzero_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
+
+    assert cp.array_equal(toptr, cp.array([1, 2, 4, 0, 0, 0]))
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_int64_bool_64_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_int64_bool_64_tree_reduction.py
new file mode 100644
index 0000000000..3aa595d34f
--- /dev/null
+++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_int64_bool_64_tree_reduction.py
@@ -0,0 +1,83 @@
+import cupy as cp
+
+cuda_kernel = """
+extern "C" {
+    __global__ void awkward_reduce_countnonzero_a(long long* toptr, bool* fromptr, long long* parents, long long lenparents, long long outlength, long long* partial) {
+       long long thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+       if (thread_id < outlength) {
+          toptr[thread_id] = 0;
+       }
+    }
+}
+    
+extern "C" {
+    __global__ void awkward_reduce_countnonzero_b(long long* toptr, bool* fromptr, long long* parents, long long lenparents, long long outlength, long long* partial) {
+        extern __shared__ long long shared[];
+
+        long long idx = threadIdx.x;
+        long long thread_id = blockIdx.x * blockDim.x + idx;
+
+        if (thread_id < lenparents) {
+            shared[idx] = (fromptr[thread_id] != 0) ? 1 : 0;
+        }
+        __syncthreads();
+
+        for (long long stride = 1; stride < blockDim.x; stride *= 2) {
+            long long val = 0;
+            if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+                val = shared[idx - stride];
+            }
+            __syncthreads();
+            shared[idx] += val;
+            __syncthreads();
+        }
+
+        if (thread_id < lenparents) {
+            long long parent = parents[thread_id];
+            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+                partial[blockIdx.x * outlength + parent] = shared[idx];
+            }
+        }
+    }
+}
+
+extern "C" {
+    __global__ void awkward_reduce_countnonzero_c(long long* toptr, bool* fromptr, long long* parents, long long lenparents, long long outlength, long long* partial) {
+        long long thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (thread_id < outlength) {
+            long long countnonzero = 0;
+            long long blocks = (lenparents + blockDim.x - 1) / blockDim.x;
+            for (long long i = 0; i < blocks; ++i) {
+                countnonzero += partial[i * outlength + thread_id];
+            }
+            toptr[thread_id] = countnonzero;
+        }
+    }
+}
+"""
+
+parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int64)
+fromptr = cp.array([1, 1, 1, 0, 1, 1, 0, 1, 1, 0], dtype=cp.bool_)
+lenparents = len(parents)
+outlength = int(cp.max(parents)) + 1
+toptr = cp.zeros(outlength, dtype=cp.int64)
+
+block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
+for i in range (len(block_size)):
+    partial = cp.zeros((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), dtype=cp.int64)
+    grid_size = (lenparents + block_size[i] - 1) // block_size[i]
+    shared_mem_size = block_size[i] * cp.int64().nbytes
+
+    raw_module = cp.RawModule(code=cuda_kernel)
+
+    awkward_reduce_countnonzero_a = raw_module.get_function('awkward_reduce_countnonzero_a')
+    awkward_reduce_countnonzero_b = raw_module.get_function('awkward_reduce_countnonzero_b')
+    awkward_reduce_countnonzero_c = raw_module.get_function('awkward_reduce_countnonzero_c')
+
+    awkward_reduce_countnonzero_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
+    awkward_reduce_countnonzero_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
+    awkward_reduce_countnonzero_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
+
+    assert cp.array_equal(toptr, cp.array([1, 2, 4, 0, 0, 0]))
\ No newline at end of file

From aa507a2a4d1cb7a88654ddb0e2563a45718356e5 Mon Sep 17 00:00:00 2001
From: ManasviGoyal <mg.manasvi@gmail.com>
Date: Thu, 23 May 2024 14:57:22 +0200
Subject: [PATCH 10/10] chore: add sum and prod complex

---
 .../awkward_reduce_argmax_tree_reduction.py   |  3 +-
 .../awkward_reduce_argmin_tree_reduction.py   |  3 +-
 .../awkward_reduce_count_64_tree_reduction.py |  3 +-
 ...ward_reduce_countnonzero_tree_reduction.py |  3 +-
 .../awkward_reduce_max_tree_reduction.py      |  3 +-
 .../awkward_reduce_min_tree_reduction.py      |  3 +-
 ...awkward_reduce_prod_bool_tree_reduction.py |  3 +-
 ...ward_reduce_prod_complex_tree_reduction.py | 86 +++++++++++++++++++
 .../awkward_reduce_prod_tree_reduction.py     |  3 +-
 .../awkward_reduce_sum_bool_tree_reduction.py |  3 +-
 ...kward_reduce_sum_complex_tree_reduction.py | 83 +++++++++---------
 ...reduce_sum_int32_bool_64_tree_reduction.py |  3 +-
 ...reduce_sum_int64_bool_64_tree_reduction.py |  3 +-
 .../awkward_reduce_sum_tree_reduction.py      |  3 +-
 14 files changed, 149 insertions(+), 56 deletions(-)
 create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_prod_complex_tree_reduction.py

diff --git a/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py
index b0c4d8cfaa..011dd3b594 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py
@@ -20,6 +20,8 @@
 
         if (thread_id < lenparents) {
             shared[idx] = thread_id;
+        } else {
+            shared[idx] = -1;
         }
         __syncthreads();
 
@@ -28,7 +30,6 @@
             if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
                 index = shared[idx - stride];
             }
-            __syncthreads();
             if (index != -1 && (shared[idx] == -1 || fromptr[index] > fromptr[shared[idx]])) {
                 shared[idx] = index;
             }
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py
index 4756809fc0..972dbaf4dc 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py
@@ -20,6 +20,8 @@
 
         if (thread_id < lenparents) {
             shared[idx] = thread_id;
+        } else {
+            shared[idx] = -1;
         }
         __syncthreads();
 
@@ -28,7 +30,6 @@
             if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
                 index = shared[idx - stride];
             }
-            __syncthreads();
             if (index != -1 && (shared[idx] == -1 || fromptr[index] < fromptr[shared[idx]])) {
                 shared[idx] = index;
             }
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_count_64_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_count_64_tree_reduction.py
index 66fe580399..0c894fee35 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_count_64_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_count_64_tree_reduction.py
@@ -20,6 +20,8 @@
 
         if (thread_id < lenparents) {
             shared[idx] = 1;
+        } else {
+            shared[idx] = 0;
         }
         __syncthreads();
 
@@ -28,7 +30,6 @@
             if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
                 val = shared[idx - stride];
             }
-            __syncthreads();
             shared[idx] += val;
             __syncthreads();
         }
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_countnonzero_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_countnonzero_tree_reduction.py
index 0d7ca6d5e0..774c3531b5 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_countnonzero_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_countnonzero_tree_reduction.py
@@ -20,6 +20,8 @@
 
         if (thread_id < lenparents) {
             shared[idx] = (fromptr[thread_id] != 0) ? 1 : 0;
+        } else {
+            shared[idx] = 0;
         }
         __syncthreads();
 
@@ -28,7 +30,6 @@
             if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
                 val = shared[idx - stride];
             }
-            __syncthreads();
             shared[idx] += val;
             __syncthreads();
         }
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py
index dfff51a25f..cecc63d415 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py
@@ -20,6 +20,8 @@
 
         if (thread_id < lenparents) {
             shared[idx] = fromptr[thread_id];
+        } else {
+            shared[idx] = identity;
         }
         __syncthreads();
 
@@ -28,7 +30,6 @@
             if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
                 val = shared[idx - stride];
             }
-            __syncthreads();
             shared[idx] = max(shared[idx], val);
             __syncthreads();
         }
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py
index 8069a3007d..fcc3d7e133 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py
@@ -20,6 +20,8 @@
 
         if (thread_id < lenparents) {
             shared[idx] = fromptr[thread_id];
+        } else {
+            shared[idx] = identity;
         }
         __syncthreads();
 
@@ -28,7 +30,6 @@
             if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
                 val = shared[idx - stride];
             }
-            __syncthreads();
             shared[idx] = min(shared[idx], val);
             __syncthreads();
         }
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py
index 156625fa68..a66ca772c2 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py
@@ -20,6 +20,8 @@
 
         if (thread_id < lenparents) {
             shared[idx] = fromptr[thread_id];
+        } else {
+            shared[idx] = 1;
         }
         __syncthreads();
 
@@ -28,7 +30,6 @@
             if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
                 val = shared[idx - stride];
             }
-            __syncthreads();
             shared[idx] &= (val != 0);
             __syncthreads();
         }
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_prod_complex_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_prod_complex_tree_reduction.py
new file mode 100644
index 0000000000..0a85c488ac
--- /dev/null
+++ b/studies/cuda-kernels/reducers/awkward_reduce_prod_complex_tree_reduction.py
@@ -0,0 +1,86 @@
+import cupy as cp
+
+cuda_kernel = """
+#include <cuda/std/complex>
+
+extern "C" {
+    __global__ void awkward_reduce_prod_complex_a(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, float* partial) {
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (thread_id < outlength) {
+            toptr[thread_id * 2] = 1.0f;
+            toptr[thread_id * 2 + 1] = 0.0f;
+        }
+    }
+
+    __global__ void awkward_reduce_prod_complex_b(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, float* partial) {
+        extern __shared__ cuda::std::complex<float> shared[];
+
+        int idx = threadIdx.x;
+        int thread_id = blockIdx.x * blockDim.x + idx;
+
+        if (thread_id < lenparents) {
+            shared[idx] = cuda::std::complex<float>(fromptr[thread_id * 2], fromptr[thread_id * 2 + 1]);
+        }
+        __syncthreads();
+
+        for (int stride = 1; stride < blockDim.x; stride *= 2) {
+            cuda::std::complex<float> val(1.0f, 0.0f);
+            if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+                val = shared[idx - stride];
+            }
+            __syncthreads();
+            shared[idx] *= val;
+            __syncthreads();
+        }
+
+        if (thread_id < lenparents) {
+            int parent = parents[thread_id];
+            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+                partial[(blockIdx.x * outlength + parent) * 2] = shared[idx].real();
+                partial[(blockIdx.x * outlength + parent) * 2 + 1] = shared[idx].imag();
+            }
+        }
+    }
+
+    __global__ void awkward_reduce_prod_complex_c(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, float* partial) {
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (thread_id < outlength) {
+            cuda::std::complex<float> prod(1.0f, 0.0f);
+            int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
+            for (int i = 0; i < blocks; ++i) {
+                cuda::std::complex<float> val(partial[(i * outlength + thread_id) * 2], partial[(i * outlength + thread_id) * 2 + 1]);
+                prod = prod * val;
+            }
+            toptr[thread_id * 2] = prod.real();
+            toptr[thread_id * 2 + 1] = prod.imag();
+        }
+    }
+}
+"""
+
+raw_module = cp.RawModule(code=cuda_kernel, options=('-I', '/usr/local/cuda-12.3/include/'),)
+
+parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 5, 5], dtype=cp.int32)
+fromptr = cp.array([1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 6, 0], dtype=cp.float32)
+lenparents = len(parents)
+outlength = int(cp.max(parents)) + 1
+toptr = cp.zeros(2 * outlength, dtype=cp.float32)
+
+block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
+for i in range(len(block_size)):
+    partial = cp.zeros(2 * outlength * ((lenparents + block_size[i] - 1) // block_size[i]), dtype=cp.float32)
+    grid_size = (lenparents + block_size[i] - 1) // block_size[i]
+    shared_mem_size = block_size[i] * 2 * cp.float32().nbytes
+
+
+    awkward_reduce_prod_complex_a = raw_module.get_function('awkward_reduce_prod_complex_a')
+    awkward_reduce_prod_complex_b = raw_module.get_function('awkward_reduce_prod_complex_b')
+    awkward_reduce_prod_complex_c = raw_module.get_function('awkward_reduce_prod_complex_c')
+
+    awkward_reduce_prod_complex_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
+    awkward_reduce_prod_complex_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
+    awkward_reduce_prod_complex_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
+
+    print(block_size[i], toptr.get())
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py
index 910ba712b8..38e97fe8f4 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py
@@ -20,6 +20,8 @@
 
         if (thread_id < lenparents) {
             shared[idx] = fromptr[thread_id];
+        } else {
+            shared[idx] = 1;
         }
         __syncthreads();
 
@@ -28,7 +30,6 @@
             if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
                 val = shared[idx - stride];
             }
-            __syncthreads();
             shared[idx] *= val;
             __syncthreads();
         }
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py
index 971331a3e6..3f9ab1bae6 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py
@@ -20,6 +20,8 @@
 
         if (thread_id < lenparents) {
             shared[idx] = fromptr[thread_id];
+        } else {
+            shared[idx] = 0;
         }
         __syncthreads();
 
@@ -28,7 +30,6 @@
             if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
                 val = shared[idx - stride];
             }
-            __syncthreads();
             shared[idx] |= (val != 0);
             __syncthreads();
         }
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_complex_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_complex_tree_reduction.py
index 06285338de..5c3f20ca56 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_sum_complex_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_complex_tree_reduction.py
@@ -1,91 +1,86 @@
 import cupy as cp
 
 cuda_kernel = """
+#include <cuda/std/complex>
+
 extern "C" {
-    __global__ void awkward_reduce_sum_complex_a(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int* partial) {
-       int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    __global__ void awkward_reduce_sum_complex_a(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, float* partial) {
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
-       if (thread_id < outlength) {
-          toptr[thread_id * 2] = 0;
-          toptr[thread_id * 2 + 1] = 0;
-       }
+        if (thread_id < outlength) {
+            toptr[thread_id * 2] = 0.0f;
+            toptr[thread_id * 2 + 1] = 0.0f;
+        }
     }
-}
-    
-extern "C" {
-    __global__ void awkward_reduce_sum_complex_b(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int* partial) {
-        extern __shared__ float shared[];
+
+    __global__ void awkward_reduce_sum_complex_b(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, float* partial) {
+        extern __shared__ cuda::std::complex<float> shared[];
 
         int idx = threadIdx.x;
         int thread_id = blockIdx.x * blockDim.x + idx;
 
         if (thread_id < lenparents) {
-            shared[idx * 2] = fromptr[thread_id * 2];
-            shared[idx * 2 + 1] = fromptr[thread_id * 2 + 1];
+            shared[idx] = cuda::std::complex<float>(fromptr[thread_id * 2], fromptr[thread_id * 2 + 1]);
         }
         __syncthreads();
 
         for (int stride = 1; stride < blockDim.x; stride *= 2) {
-            float real = 0;
-            float imag = 0;
+            cuda::std::complex<float> val(0.0f, 0.0f);
             if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
-                real = shared[(idx - stride) * 2];
-                imag = shared[(idx - stride) * 2 + 1];
+                val = shared[idx - stride];
             }
             __syncthreads();
-            shared[idx * 2] += real;
-            shared[idx * 2 + 1] += imag;
+            shared[idx] += val;
             __syncthreads();
         }
 
         if (thread_id < lenparents) {
             int parent = parents[thread_id];
             if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
-                partial[(blockIdx.x * outlength + parent) * 2 ] = shared[idx * 2];
-                partial[(blockIdx.x * outlength + parent) * 2 + 1] = shared[idx * 2 + 1];
+                partial[(blockIdx.x * outlength + parent) * 2] = shared[idx].real();
+                partial[(blockIdx.x * outlength + parent) * 2 + 1] = shared[idx].imag();
             }
         }
     }
-}
 
-extern "C" {
-    __global__ void awkward_reduce_sum_complex_c(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int* partial) {
+    __global__ void awkward_reduce_sum_complex_c(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, float* partial) {
         int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
         if (thread_id < outlength) {
-            float real = 0;
-            float imag = 0;
+            cuda::std::complex<float> sum(0.0f, 0.0f);
             int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
             for (int i = 0; i < blocks; ++i) {
-                real += partial[(i * outlength + thread_id) * 2];
-                imag += partial[(i * outlength + thread_id) * 2 + 1];
+                cuda::std::complex<float> val(partial[(i * outlength + thread_id) * 2], partial[(i * outlength + thread_id) * 2 + 1]);
+                sum += val;
             }
-            toptr[thread_id * 2] = real;
-            toptr[thread_id * 2 + 1] = imag;
+            toptr[thread_id * 2] = sum.real();
+            toptr[thread_id * 2 + 1] = sum.imag();
         }
     }
 }
 """
 
+raw_module = cp.RawModule(code=cuda_kernel, options=('-I', '/usr/local/cuda-12.3/include/'),)
+
 parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32)
-fromptr = cp.array([1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 0], dtype=cp.float32)
+fromptr = cp.array([1, 0, 2.5677, 1.2345, 3.2367, 2.256576, 4.3456, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 0], dtype=cp.float32)
 lenparents = len(parents)
 outlength = int(cp.max(parents)) + 1
-toptr = cp.zeros(outlength * 2, dtype=cp.float32)
+toptr = cp.zeros(2 * outlength, dtype=cp.float32)
 
-block_size = 2
-partial = cp.zeros((outlength * ((lenparents + block_size - 1) // block_size)), dtype=cp.float32)
-grid_size = (lenparents + block_size - 1) // block_size
-shared_mem_size = block_size * cp.int32().nbytes
+block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
+for i in range(len(block_size)):
+    partial = cp.zeros(2 * outlength * ((lenparents + block_size[i] - 1) // block_size[i]), dtype=cp.float32)
+    grid_size = (lenparents + block_size[i] - 1) // block_size[i]
+    shared_mem_size = block_size[i] * 2 * cp.float32().nbytes
 
-raw_module = cp.RawModule(code=cuda_kernel)
 
-awkward_reduce_sum_complex_a = raw_module.get_function('awkward_reduce_sum_complex_a')
-awkward_reduce_sum_complex_b = raw_module.get_function('awkward_reduce_sum_complex_b')
-awkward_reduce_sum_complex_c = raw_module.get_function('awkward_reduce_sum_complex_c')
+    awkward_reduce_sum_complex_a = raw_module.get_function('awkward_reduce_sum_complex_a')
+    awkward_reduce_sum_complex_b = raw_module.get_function('awkward_reduce_sum_complex_b')
+    awkward_reduce_sum_complex_c = raw_module.get_function('awkward_reduce_sum_complex_c')
 
-awkward_reduce_sum_complex_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
-awkward_reduce_sum_complex_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
-awkward_reduce_sum_complex_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial))
+    awkward_reduce_sum_complex_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
+    awkward_reduce_sum_complex_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
+    awkward_reduce_sum_complex_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
 
-assert cp.array_equal(toptr, cp.array([1, 0, 5, 3, 39, 33, 0, 0, 0, 0, 10, 0]))
\ No newline at end of file
+    print(block_size[i], toptr.get())
\ No newline at end of file
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_int32_bool_64_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_int32_bool_64_tree_reduction.py
index 28a0cb3a34..142e111730 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_sum_int32_bool_64_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_int32_bool_64_tree_reduction.py
@@ -20,6 +20,8 @@
 
         if (thread_id < lenparents) {
             shared[idx] = (fromptr[thread_id] != 0) ? 1 : 0;
+        } else {
+            shared[idx] = 0;
         }
         __syncthreads();
 
@@ -28,7 +30,6 @@
             if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
                 val = shared[idx - stride];
             }
-            __syncthreads();
             shared[idx] += val;
             __syncthreads();
         }
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_int64_bool_64_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_int64_bool_64_tree_reduction.py
index 3aa595d34f..69be141d00 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_sum_int64_bool_64_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_int64_bool_64_tree_reduction.py
@@ -20,6 +20,8 @@
 
         if (thread_id < lenparents) {
             shared[idx] = (fromptr[thread_id] != 0) ? 1 : 0;
+        } else {
+            shared[idx] = 0;
         }
         __syncthreads();
 
@@ -28,7 +30,6 @@
             if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
                 val = shared[idx - stride];
             }
-            __syncthreads();
             shared[idx] += val;
             __syncthreads();
         }
diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py
index 11428c5475..db460ec574 100644
--- a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py
+++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py
@@ -20,6 +20,8 @@
 
         if (thread_id < lenparents) {
             shared[idx] = fromptr[thread_id];
+        } else {
+            shared[idx] = 0;
         }
         __syncthreads();
 
@@ -28,7 +30,6 @@
             if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
                 val = shared[idx - stride];
             }
-            __syncthreads();
             shared[idx] += val;
             __syncthreads();
         }