From 1e1b8ea4ade00407d7a584a34887163bb2fa01be Mon Sep 17 00:00:00 2001 From: Manasvi Goyal Date: Thu, 16 May 2024 17:50:27 +0200 Subject: [PATCH 01/10] chore: trying atomics and tree reduction for CUDA reducer kernels --- .../reducers/awkward_reduce_sum_atomics.py | 43 ++++++++++++ .../awkward_reduce_sum_tree_reduction.py | 65 +++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_sum_atomics.py create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_atomics.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_atomics.py new file mode 100644 index 0000000000..d1e456cf4b --- /dev/null +++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_atomics.py @@ -0,0 +1,43 @@ +import cupy as cp + +cuda_kernel = """ +extern "C" { + __global__ void reduce_sum_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + toptr[thread_id] = 0; + } + } +} +extern "C" { + __global__ void reduce_sum_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + for (int i = thread_id; i < lenparents; i += stride) { + atomicAdd(&toptr[parents[i]], fromptr[i]); + } + } +} +""" + +parents = cp.array([0, 1, 1, 2, 2, 3, 3, 3, 5], dtype=cp.int32) +fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=cp.int32) +lenparents = len(parents) +outlength = int(cp.max(parents)) + 1 +toptr = cp.zeros(outlength, dtype=cp.int32) + +block_size = 256 +grid_size = (lenparents + block_size - 1) // block_size + +raw_module = cp.RawModule(code=cuda_kernel) + +reduce_sum_a = raw_module.get_function('reduce_sum_a') +reduce_sum_b = raw_module.get_function('reduce_sum_b') + +reduce_sum_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) +reduce_sum_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) + +toptr_host = toptr.get() +print("atomic toptr:", toptr_host) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py new file mode 100644 index 0000000000..82b072cc63 --- /dev/null +++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py @@ -0,0 +1,65 @@ +import cupy as cp + +cuda_kernel = """ +extern "C" { + __global__ void reduce_sum_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + toptr[thread_id] = 0; + } + } +} + extern "C" { + __global__ void reduce_sum_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength) { + extern __shared__ int shared[]; + + int idx = threadIdx.x; + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < lenparents) { + shared[idx] = fromptr[thread_id]; + __syncthreads(); + + for (int stride = 1; stride < blockDim.x; stride *= 2) { + int index = idx - stride; + if (index >= 0 && parents[index] == parents[idx]) { + shared[idx] += shared[index]; + } + __syncthreads(); + } + + fromptr[thread_id] = shared[idx]; + + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + int parent = parents[thread_id]; + if (parent < lenparents) { + toptr[parent] = shared[idx]; + } + } + } + } +} +""" + +parents = cp.array([0, 1, 1, 2, 2, 3, 3, 3, 5], dtype=cp.int32) +fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=cp.int32) +lenparents = len(parents) +outlength = int(cp.max(parents)) + 1 +toptr = cp.zeros(outlength, dtype=cp.int32) + +block_size = 256 +grid_size = (lenparents + block_size - 1) // block_size + +toptr = cp.zeros(outlength, dtype=cp.int32) + +raw_module = cp.RawModule(code=cuda_kernel) + +reduce_sum_a = raw_module.get_function('reduce_sum_a') +reduce_sum_b = raw_module.get_function('reduce_sum_b') + +reduce_sum_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) +reduce_sum_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) + +toptr_host = toptr.get() +print("tree reduction toptr:", toptr_host) \ No newline at end of file From f79b2ae3a8440432f1fd6c4bbe553aa6f4ee3500 Mon Sep 17 00:00:00 2001 From: Manasvi Goyal Date: Thu, 16 May 2024 18:03:29 +0200 Subject: [PATCH 02/10] chore: add prod, min, max --- .../awkward_reduce_max_tree_reduction.py | 65 +++++++++++++++++++ .../awkward_reduce_min_tree_reduction.py | 65 +++++++++++++++++++ .../awkward_reduce_prod_tree_reduction.py | 65 +++++++++++++++++++ .../reducers/awkward_reduce_sum_atomics.py | 12 ++-- .../awkward_reduce_sum_tree_reduction.py | 12 ++-- 5 files changed, 207 insertions(+), 12 deletions(-) create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py diff --git a/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py new file mode 100644 index 0000000000..a00047672a --- /dev/null +++ b/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py @@ -0,0 +1,65 @@ +import cupy as cp + +cuda_kernel = """ +extern "C" { + __global__ void awkward_reduce_max_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + toptr[thread_id] = -9223372036854775808; + } + } +} + extern "C" { + __global__ void awkward_reduce_max_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength) { + extern __shared__ int shared[]; + + int idx = threadIdx.x; + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < lenparents) { + shared[idx] = fromptr[thread_id]; + __syncthreads(); + + for (int stride = 1; stride < blockDim.x; stride *= 2) { + int index = idx - stride; + if (index >= 0 && parents[index] == parents[idx]) { + shared[tid] = max(shared[tid], shared[index]); + } + __syncthreads(); + } + + fromptr[thread_id] = shared[idx]; + + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + int parent = parents[thread_id]; + if (parent < lenparents) { + toptr[parent] = shared[idx]; + } + } + } + } +} +""" + +parents = cp.array([0, 1, 1, 2, 2, 3, 3, 3, 5], dtype=cp.int32) +fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=cp.int32) +lenparents = len(parents) +outlength = int(cp.max(parents)) + 1 +toptr = cp.zeros(outlength, dtype=cp.int32) + +block_size = 256 +grid_size = (lenparents + block_size - 1) // block_size + +toptr = cp.zeros(outlength, dtype=cp.int32) + +raw_module = cp.RawModule(code=cuda_kernel) + +awkward_reduce_max_a = raw_module.get_function('awkward_reduce_max_a') +awkward_reduce_max_b = raw_module.get_function('awkward_reduce_max_b') + +awkward_reduce_max_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) +awkward_reduce_max_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) + +toptr_host = toptr.get() +print("tree reduction toptr:", toptr_host) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py new file mode 100644 index 0000000000..4ee35375c5 --- /dev/null +++ b/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py @@ -0,0 +1,65 @@ +import cupy as cp + +cuda_kernel = """ +extern "C" { + __global__ void awkward_reduce_min_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + toptr[thread_id] = 9223372036854775807; + } + } +} + extern "C" { + __global__ void awkward_reduce_min_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength) { + extern __shared__ int shared[]; + + int idx = threadIdx.x; + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < lenparents) { + shared[idx] = fromptr[thread_id]; + __syncthreads(); + + for (int stride = 1; stride < blockDim.x; stride *= 2) { + int index = idx - stride; + if (index >= 0 && parents[index] == parents[idx]) { + shared[tid] = min(shared[tid], shared[index]); + } + __syncthreads(); + } + + fromptr[thread_id] = shared[idx]; + + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + int parent = parents[thread_id]; + if (parent < lenparents) { + toptr[parent] = shared[idx]; + } + } + } + } +} +""" + +parents = cp.array([0, 1, 1, 2, 2, 3, 3, 3, 5], dtype=cp.int32) +fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=cp.int32) +lenparents = len(parents) +outlength = int(cp.max(parents)) + 1 +toptr = cp.zeros(outlength, dtype=cp.int32) + +block_size = 256 +grid_size = (lenparents + block_size - 1) // block_size + +toptr = cp.zeros(outlength, dtype=cp.int32) + +raw_module = cp.RawModule(code=cuda_kernel) + +awkward_reduce_min_a = raw_module.get_function('awkward_reduce_min_a') +awkward_reduce_min_b = raw_module.get_function('awkward_reduce_min_b') + +awkward_reduce_min_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) +awkward_reduce_min_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) + +toptr_host = toptr.get() +print("tree reduction toptr:", toptr_host) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py new file mode 100644 index 0000000000..ac17bbfdeb --- /dev/null +++ b/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py @@ -0,0 +1,65 @@ +import cupy as cp + +cuda_kernel = """ +extern "C" { + __global__ void awkward_reduce_prod_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + toptr[thread_id] = 1; + } + } +} + extern "C" { + __global__ void awkward_reduce_prod_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength) { + extern __shared__ int shared[]; + + int idx = threadIdx.x; + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < lenparents) { + shared[idx] = fromptr[thread_id]; + __syncthreads(); + + for (int stride = 1; stride < blockDim.x; stride *= 2) { + int index = idx - stride; + if (index >= 0 && parents[index] == parents[idx]) { + shared[idx] *= shared[index]; + } + __syncthreads(); + } + + fromptr[thread_id] = shared[idx]; + + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + int parent = parents[thread_id]; + if (parent < lenparents) { + toptr[parent] = shared[idx]; + } + } + } + } +} +""" + +parents = cp.array([0, 1, 1, 2, 2, 3, 3, 3, 5], dtype=cp.int32) +fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=cp.int32) +lenparents = len(parents) +outlength = int(cp.max(parents)) + 1 +toptr = cp.zeros(outlength, dtype=cp.int32) + +block_size = 256 +grid_size = (lenparents + block_size - 1) // block_size + +toptr = cp.zeros(outlength, dtype=cp.int32) + +raw_module = cp.RawModule(code=cuda_kernel) + +awkward_reduce_prod_a = raw_module.get_function('awkward_reduce_prod_a') +awkward_reduce_prod_b = raw_module.get_function('awkward_reduce_prod_b') + +awkward_reduce_prod_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) +awkward_reduce_prod_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) + +toptr_host = toptr.get() +print("tree reduction toptr:", toptr_host) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_atomics.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_atomics.py index d1e456cf4b..dc2d6185ad 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_sum_atomics.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_atomics.py @@ -2,7 +2,7 @@ cuda_kernel = """ extern "C" { - __global__ void reduce_sum_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) { + __global__ void awkward_reduce_sum_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) { int thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { @@ -11,7 +11,7 @@ } } extern "C" { - __global__ void reduce_sum_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) { + __global__ void awkward_reduce_sum_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) { int thread_id = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; @@ -33,11 +33,11 @@ raw_module = cp.RawModule(code=cuda_kernel) -reduce_sum_a = raw_module.get_function('reduce_sum_a') -reduce_sum_b = raw_module.get_function('reduce_sum_b') +awkward_reduce_sum_a = raw_module.get_function('awkward_reduce_sum_a') +awkward_reduce_sum_b = raw_module.get_function('awkward_reduce_sum_b') -reduce_sum_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) -reduce_sum_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) +awkward_reduce_sum_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) +awkward_reduce_sum_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) toptr_host = toptr.get() print("atomic toptr:", toptr_host) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py index 82b072cc63..8ec46ad4f8 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py @@ -2,7 +2,7 @@ cuda_kernel = """ extern "C" { - __global__ void reduce_sum_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) { + __global__ void awkward_reduce_sum_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) { int thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { @@ -11,7 +11,7 @@ } } extern "C" { - __global__ void reduce_sum_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength) { + __global__ void awkward_reduce_sum_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength) { extern __shared__ int shared[]; int idx = threadIdx.x; @@ -55,11 +55,11 @@ raw_module = cp.RawModule(code=cuda_kernel) -reduce_sum_a = raw_module.get_function('reduce_sum_a') -reduce_sum_b = raw_module.get_function('reduce_sum_b') +awkward_reduce_sum_a = raw_module.get_function('awkward_reduce_sum_a') +awkward_reduce_sum_b = raw_module.get_function('awkward_reduce_sum_b') -reduce_sum_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) -reduce_sum_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) +awkward_reduce_sum_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) +awkward_reduce_sum_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) toptr_host = toptr.get() print("tree reduction toptr:", toptr_host) \ No newline at end of file From 49da4f26ae6c2b07e2b43ac82f92d5df6fa7a0cb Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Fri, 17 May 2024 09:33:53 +0200 Subject: [PATCH 03/10] fix: some fixes --- .../reducers/awkward_reduce_max_tree_reduction.py | 15 +++++++-------- .../reducers/awkward_reduce_min_tree_reduction.py | 15 +++++++-------- .../awkward_reduce_prod_tree_reduction.py | 2 -- .../reducers/awkward_reduce_sum_tree_reduction.py | 2 -- 4 files changed, 14 insertions(+), 20 deletions(-) diff --git a/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py index a00047672a..eb20720a25 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py @@ -2,16 +2,16 @@ cuda_kernel = """ extern "C" { - __global__ void awkward_reduce_max_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) { + __global__ void awkward_reduce_max_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int identity) { int thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = -9223372036854775808; + toptr[thread_id] = identity; } } } extern "C" { - __global__ void awkward_reduce_max_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength) { + __global__ void awkward_reduce_max_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int identity) { extern __shared__ int shared[]; int idx = threadIdx.x; @@ -24,7 +24,7 @@ for (int stride = 1; stride < blockDim.x; stride *= 2) { int index = idx - stride; if (index >= 0 && parents[index] == parents[idx]) { - shared[tid] = max(shared[tid], shared[index]); + shared[idx] = max(shared[idx], shared[index]); } __syncthreads(); } @@ -46,20 +46,19 @@ fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=cp.int32) lenparents = len(parents) outlength = int(cp.max(parents)) + 1 +identity = cp.int32(cp.iinfo(cp.int32).min) toptr = cp.zeros(outlength, dtype=cp.int32) block_size = 256 grid_size = (lenparents + block_size - 1) // block_size -toptr = cp.zeros(outlength, dtype=cp.int32) - raw_module = cp.RawModule(code=cuda_kernel) awkward_reduce_max_a = raw_module.get_function('awkward_reduce_max_a') awkward_reduce_max_b = raw_module.get_function('awkward_reduce_max_b') -awkward_reduce_max_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) -awkward_reduce_max_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) +awkward_reduce_max_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity)) +awkward_reduce_max_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity)) toptr_host = toptr.get() print("tree reduction toptr:", toptr_host) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py index 4ee35375c5..07e26bc878 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py @@ -2,16 +2,16 @@ cuda_kernel = """ extern "C" { - __global__ void awkward_reduce_min_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) { + __global__ void awkward_reduce_min_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int identity) { int thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = 9223372036854775807; + toptr[thread_id] = identity; } } } extern "C" { - __global__ void awkward_reduce_min_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength) { + __global__ void awkward_reduce_min_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int identity) { extern __shared__ int shared[]; int idx = threadIdx.x; @@ -24,7 +24,7 @@ for (int stride = 1; stride < blockDim.x; stride *= 2) { int index = idx - stride; if (index >= 0 && parents[index] == parents[idx]) { - shared[tid] = min(shared[tid], shared[index]); + shared[idx] = min(shared[idx], shared[index]); } __syncthreads(); } @@ -46,20 +46,19 @@ fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=cp.int32) lenparents = len(parents) outlength = int(cp.max(parents)) + 1 +identity = cp.int32(cp.iinfo(cp.int32).max) toptr = cp.zeros(outlength, dtype=cp.int32) block_size = 256 grid_size = (lenparents + block_size - 1) // block_size -toptr = cp.zeros(outlength, dtype=cp.int32) - raw_module = cp.RawModule(code=cuda_kernel) awkward_reduce_min_a = raw_module.get_function('awkward_reduce_min_a') awkward_reduce_min_b = raw_module.get_function('awkward_reduce_min_b') -awkward_reduce_min_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) -awkward_reduce_min_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) +awkward_reduce_min_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity)) +awkward_reduce_min_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity)) toptr_host = toptr.get() print("tree reduction toptr:", toptr_host) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py index ac17bbfdeb..0b543d3423 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py @@ -51,8 +51,6 @@ block_size = 256 grid_size = (lenparents + block_size - 1) // block_size -toptr = cp.zeros(outlength, dtype=cp.int32) - raw_module = cp.RawModule(code=cuda_kernel) awkward_reduce_prod_a = raw_module.get_function('awkward_reduce_prod_a') diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py index 8ec46ad4f8..a8ae4f3335 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py @@ -51,8 +51,6 @@ block_size = 256 grid_size = (lenparents + block_size - 1) // block_size -toptr = cp.zeros(outlength, dtype=cp.int32) - raw_module = cp.RawModule(code=cuda_kernel) awkward_reduce_sum_a = raw_module.get_function('awkward_reduce_sum_a') From 80f91c969e51a2348e62238c536d70fb650c4638 Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Tue, 21 May 2024 14:05:23 +0200 Subject: [PATCH 04/10] fix: handle block boundaries --- .../awkward_reduce_max_tree_reduction.py | 67 ++++++++++++------- .../awkward_reduce_min_tree_reduction.py | 67 ++++++++++++------- .../awkward_reduce_prod_tree_reduction.py | 62 +++++++++++------ .../awkward_reduce_sum_tree_reduction.py | 62 +++++++++++------ 4 files changed, 168 insertions(+), 90 deletions(-) diff --git a/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py index eb20720a25..0624c3ed8c 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py @@ -2,7 +2,7 @@ cuda_kernel = """ extern "C" { - __global__ void awkward_reduce_max_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int identity) { + __global__ void awkward_reduce_max_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) { int thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { @@ -10,55 +10,74 @@ } } } - extern "C" { - __global__ void awkward_reduce_max_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int identity) { + +extern "C" { + __global__ void awkward_reduce_max_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int identity, int* partial) { extern __shared__ int shared[]; int idx = threadIdx.x; - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { shared[idx] = fromptr[thread_id]; + } + __syncthreads(); + + for (int stride = 1; stride < blockDim.x; stride *= 2) { + int val = identity; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = shared[idx - stride]; + } + __syncthreads(); + shared[idx] = max(shared[idx], val); __syncthreads(); + } - for (int stride = 1; stride < blockDim.x; stride *= 2) { - int index = idx - stride; - if (index >= 0 && parents[index] == parents[idx]) { - shared[idx] = max(shared[idx], shared[index]); - } - __syncthreads(); + if (thread_id < lenparents) { + int parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = shared[idx]; } + } + } +} - fromptr[thread_id] = shared[idx]; +extern "C" { + __global__ void awkward_reduce_max_c(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int identity, int* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - int parent = parents[thread_id]; - if (parent < lenparents) { - toptr[parent] = shared[idx]; - } + if (thread_id < outlength) { + int maximum = identity; + int blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int i = 0; i < blocks; ++i) { + maximum = max(maximum, partial[i * outlength + thread_id]); } + toptr[thread_id] = maximum; } } } """ -parents = cp.array([0, 1, 1, 2, 2, 3, 3, 3, 5], dtype=cp.int32) -fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=cp.int32) +parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32) +fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32) lenparents = len(parents) outlength = int(cp.max(parents)) + 1 -identity = cp.int32(cp.iinfo(cp.int32).min) -toptr = cp.zeros(outlength, dtype=cp.int32) - -block_size = 256 +toptr = cp.full(outlength, cp.iinfo(cp.int32).min, dtype=cp.int32) +identity = cp.iinfo(cp.int32).min +block_size = 2 +partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), cp.iinfo(cp.int32).min, dtype=cp.int32) grid_size = (lenparents + block_size - 1) // block_size +shared_mem_size = block_size * cp.int32().nbytes raw_module = cp.RawModule(code=cuda_kernel) awkward_reduce_max_a = raw_module.get_function('awkward_reduce_max_a') awkward_reduce_max_b = raw_module.get_function('awkward_reduce_max_b') +awkward_reduce_max_c = raw_module.get_function('awkward_reduce_max_c') -awkward_reduce_max_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity)) -awkward_reduce_max_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity)) +awkward_reduce_max_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial)) +awkward_reduce_max_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial), shared_mem=shared_mem_size) +awkward_reduce_max_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial)) toptr_host = toptr.get() print("tree reduction toptr:", toptr_host) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py index 07e26bc878..5810ca97e5 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py @@ -2,7 +2,7 @@ cuda_kernel = """ extern "C" { - __global__ void awkward_reduce_min_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int identity) { + __global__ void awkward_reduce_min_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) { int thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { @@ -10,55 +10,74 @@ } } } - extern "C" { - __global__ void awkward_reduce_min_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int identity) { + +extern "C" { + __global__ void awkward_reduce_min_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int identity, int* partial) { extern __shared__ int shared[]; int idx = threadIdx.x; - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { shared[idx] = fromptr[thread_id]; + } + __syncthreads(); + + for (int stride = 1; stride < blockDim.x; stride *= 2) { + int val = identity; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = shared[idx - stride]; + } + __syncthreads(); + shared[idx] = min(shared[idx], val); __syncthreads(); + } - for (int stride = 1; stride < blockDim.x; stride *= 2) { - int index = idx - stride; - if (index >= 0 && parents[index] == parents[idx]) { - shared[idx] = min(shared[idx], shared[index]); - } - __syncthreads(); + if (thread_id < lenparents) { + int parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = shared[idx]; } + } + } +} - fromptr[thread_id] = shared[idx]; +extern "C" { + __global__ void awkward_reduce_min_c(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int identity, int* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - int parent = parents[thread_id]; - if (parent < lenparents) { - toptr[parent] = shared[idx]; - } + if (thread_id < outlength) { + int minimum = identity; + int blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int i = 0; i < blocks; ++i) { + minimum = min(minimum, partial[i * outlength + thread_id]); } + toptr[thread_id] = minimum; } } } """ -parents = cp.array([0, 1, 1, 2, 2, 3, 3, 3, 5], dtype=cp.int32) -fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=cp.int32) +parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32) +fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32) lenparents = len(parents) outlength = int(cp.max(parents)) + 1 -identity = cp.int32(cp.iinfo(cp.int32).max) -toptr = cp.zeros(outlength, dtype=cp.int32) - -block_size = 256 +toptr = cp.full(outlength, cp.iinfo(cp.int32).max, dtype=cp.int32) +identity = cp.iinfo(cp.int32).max +block_size = 2 +partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), cp.iinfo(cp.int32).max, dtype=cp.int32) grid_size = (lenparents + block_size - 1) // block_size +shared_mem_size = block_size * cp.int32().nbytes raw_module = cp.RawModule(code=cuda_kernel) awkward_reduce_min_a = raw_module.get_function('awkward_reduce_min_a') awkward_reduce_min_b = raw_module.get_function('awkward_reduce_min_b') +awkward_reduce_min_c = raw_module.get_function('awkward_reduce_min_c') -awkward_reduce_min_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity)) -awkward_reduce_min_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity)) +awkward_reduce_min_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial)) +awkward_reduce_min_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial), shared_mem=shared_mem_size) +awkward_reduce_min_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial)) toptr_host = toptr.get() print("tree reduction toptr:", toptr_host) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py index 0b543d3423..48f33cc5c8 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py @@ -2,7 +2,7 @@ cuda_kernel = """ extern "C" { - __global__ void awkward_reduce_prod_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) { + __global__ void awkward_reduce_prod_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) { int thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { @@ -10,54 +10,74 @@ } } } - extern "C" { - __global__ void awkward_reduce_prod_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength) { + +extern "C" { + __global__ void awkward_reduce_prod_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) { extern __shared__ int shared[]; int idx = threadIdx.x; - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { shared[idx] = fromptr[thread_id]; + } + __syncthreads(); + + for (int stride = 1; stride < blockDim.x; stride *= 2) { + int val = 1; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = shared[idx - stride]; + } + __syncthreads(); + shared[idx] *= val; __syncthreads(); + } - for (int stride = 1; stride < blockDim.x; stride *= 2) { - int index = idx - stride; - if (index >= 0 && parents[index] == parents[idx]) { - shared[idx] *= shared[index]; - } - __syncthreads(); + if (thread_id < lenparents) { + int parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = shared[idx]; } + } + } +} - fromptr[thread_id] = shared[idx]; +extern "C" { + __global__ void awkward_reduce_prod_c(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - int parent = parents[thread_id]; - if (parent < lenparents) { - toptr[parent] = shared[idx]; - } + if (thread_id < outlength) { + int prod = 1; + int blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int i = 0; i < blocks; ++i) { + prod *= partial[i * outlength + thread_id]; } + toptr[thread_id] = prod; } } } """ -parents = cp.array([0, 1, 1, 2, 2, 3, 3, 3, 5], dtype=cp.int32) -fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=cp.int32) +parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32) +fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32) lenparents = len(parents) outlength = int(cp.max(parents)) + 1 toptr = cp.zeros(outlength, dtype=cp.int32) -block_size = 256 +block_size = 2 +partial = cp.ones((outlength * ((lenparents + block_size - 1) // block_size)), dtype=cp.int32) grid_size = (lenparents + block_size - 1) // block_size +shared_mem_size = block_size * cp.int32().nbytes raw_module = cp.RawModule(code=cuda_kernel) awkward_reduce_prod_a = raw_module.get_function('awkward_reduce_prod_a') awkward_reduce_prod_b = raw_module.get_function('awkward_reduce_prod_b') +awkward_reduce_prod_c = raw_module.get_function('awkward_reduce_prod_c') -awkward_reduce_prod_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) -awkward_reduce_prod_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) +awkward_reduce_prod_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) +awkward_reduce_prod_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) +awkward_reduce_prod_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) toptr_host = toptr.get() print("tree reduction toptr:", toptr_host) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py index a8ae4f3335..c8757ebb64 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py @@ -2,7 +2,7 @@ cuda_kernel = """ extern "C" { - __global__ void awkward_reduce_sum_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength) { + __global__ void awkward_reduce_sum_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) { int thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { @@ -10,54 +10,74 @@ } } } - extern "C" { - __global__ void awkward_reduce_sum_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength) { + +extern "C" { + __global__ void awkward_reduce_sum_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) { extern __shared__ int shared[]; int idx = threadIdx.x; - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { shared[idx] = fromptr[thread_id]; + } + __syncthreads(); + + for (int stride = 1; stride < blockDim.x; stride *= 2) { + int val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = shared[idx - stride]; + } + __syncthreads(); + shared[idx] += val; __syncthreads(); + } - for (int stride = 1; stride < blockDim.x; stride *= 2) { - int index = idx - stride; - if (index >= 0 && parents[index] == parents[idx]) { - shared[idx] += shared[index]; - } - __syncthreads(); + if (thread_id < lenparents) { + int parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = shared[idx]; } + } + } +} - fromptr[thread_id] = shared[idx]; +extern "C" { + __global__ void awkward_reduce_sum_c(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - int parent = parents[thread_id]; - if (parent < lenparents) { - toptr[parent] = shared[idx]; - } + if (thread_id < outlength) { + int sum = 0; + int blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int i = 0; i < blocks; ++i) { + sum += partial[i * outlength + thread_id]; } + toptr[thread_id] = sum; } } } """ -parents = cp.array([0, 1, 1, 2, 2, 3, 3, 3, 5], dtype=cp.int32) -fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=cp.int32) +parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32) +fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32) lenparents = len(parents) outlength = int(cp.max(parents)) + 1 toptr = cp.zeros(outlength, dtype=cp.int32) -block_size = 256 +block_size = 2 +partial = cp.zeros((outlength * ((lenparents + block_size - 1) // block_size)), dtype=cp.int32) grid_size = (lenparents + block_size - 1) // block_size +shared_mem_size = block_size * cp.int32().nbytes raw_module = cp.RawModule(code=cuda_kernel) awkward_reduce_sum_a = raw_module.get_function('awkward_reduce_sum_a') awkward_reduce_sum_b = raw_module.get_function('awkward_reduce_sum_b') +awkward_reduce_sum_c = raw_module.get_function('awkward_reduce_sum_c') -awkward_reduce_sum_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) -awkward_reduce_sum_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength)) +awkward_reduce_sum_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) +awkward_reduce_sum_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) +awkward_reduce_sum_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) toptr_host = toptr.get() print("tree reduction toptr:", toptr_host) \ No newline at end of file From f000fd83e5eafaf82629bcd629db34c8f868cda0 Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Tue, 21 May 2024 14:26:25 +0200 Subject: [PATCH 05/10] chore: add argmin and argmax --- .../awkward_reduce_argmax_tree_reduction.py | 87 +++++++++++++++++++ .../awkward_reduce_argmin_tree_reduction.py | 87 +++++++++++++++++++ 2 files changed, 174 insertions(+) create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py diff --git a/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py new file mode 100644 index 0000000000..41221ddf6b --- /dev/null +++ b/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py @@ -0,0 +1,87 @@ +import cupy as cp + +cuda_kernel = """ +extern "C" { + __global__ void awkward_reduce_argmax_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + toptr[thread_id] = -1; + } + } +} + +extern "C" { + __global__ void awkward_reduce_argmax_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) { + extern __shared__ int shared[]; + + int idx = threadIdx.x; + int thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + shared[idx] = thread_id; + } + __syncthreads(); + + for (int stride = 1; stride < blockDim.x; stride *= 2) { + int index = -1; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + index = shared[idx - stride]; + } + __syncthreads(); + if (index != -1 && (shared[idx] == -1 || fromptr[index] > fromptr[shared[idx]])) { + shared[idx] = index; + } + __syncthreads(); + } + + if (thread_id < lenparents) { + int parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = shared[idx]; + } + } + } +} + +extern "C" { + __global__ void awkward_reduce_argmax_c(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + int max_index = -1; + int blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int i = 0; i < blocks; ++i) { + int index = partial[i * outlength + thread_id]; + if (index != -1 && (max_index == -1 || fromptr[index] > fromptr[max_index])) { + max_index = index; + } + } + toptr[thread_id] = max_index; + } + } +} +""" + +parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32) +fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32) +lenparents = len(parents) +outlength = int(cp.max(parents)) + 1 +toptr = cp.full(outlength, -1, dtype=cp.int32) +block_size = 2 +partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), -1, dtype=cp.int32) +grid_size = (lenparents + block_size - 1) // block_size +shared_mem_size = block_size * cp.int32().nbytes + +raw_module = cp.RawModule(code=cuda_kernel) + +awkward_reduce_argmax_a = raw_module.get_function('awkward_reduce_argmax_a') +awkward_reduce_argmax_b = raw_module.get_function('awkward_reduce_argmax_b') +awkward_reduce_argmax_c = raw_module.get_function('awkward_reduce_argmax_c') + +awkward_reduce_argmax_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) +awkward_reduce_argmax_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) +awkward_reduce_argmax_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) + +toptr_host = toptr.get() +print("tree reduction toptr:", toptr_host) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py new file mode 100644 index 0000000000..8718339519 --- /dev/null +++ b/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py @@ -0,0 +1,87 @@ +import cupy as cp + +cuda_kernel = """ +extern "C" { + __global__ void awkward_reduce_argmin_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + toptr[thread_id] = -1; + } + } +} + +extern "C" { + __global__ void awkward_reduce_argmin_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) { + extern __shared__ int shared[]; + + int idx = threadIdx.x; + int thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + shared[idx] = thread_id; + } + __syncthreads(); + + for (int stride = 1; stride < blockDim.x; stride *= 2) { + int index = -1; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + index = shared[idx - stride]; + } + __syncthreads(); + if (index != -1 && (shared[idx] == -1 || fromptr[index] < fromptr[shared[idx]])) { + shared[idx] = index; + } + __syncthreads(); + } + + if (thread_id < lenparents) { + int parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = shared[idx]; + } + } + } +} + +extern "C" { + __global__ void awkward_reduce_argmin_c(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + int min_index = -1; + int blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int i = 0; i < blocks; ++i) { + int index = partial[i * outlength + thread_id]; + if (index != -1 && (min_index == -1 || fromptr[index] < fromptr[min_index])) { + min_index = index; + } + } + toptr[thread_id] = min_index; + } + } +} +""" + +parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32) +fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32) +lenparents = len(parents) +outlength = int(cp.max(parents)) + 1 +toptr = cp.full(outlength, -1, dtype=cp.int32) +block_size = 2 +partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), -1, dtype=cp.int32) +grid_size = (lenparents + block_size - 1) // block_size +shared_mem_size = block_size * cp.int32().nbytes + +raw_module = cp.RawModule(code=cuda_kernel) + +awkward_reduce_argmin_a = raw_module.get_function('awkward_reduce_argmin_a') +awkward_reduce_argmin_b = raw_module.get_function('awkward_reduce_argmin_b') +awkward_reduce_argmin_c = raw_module.get_function('awkward_reduce_argmin_c') + +awkward_reduce_argmin_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) +awkward_reduce_argmin_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) +awkward_reduce_argmin_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) + +toptr_host = toptr.get() +print("tree reduction toptr:", toptr_host) \ No newline at end of file From c040a039c60c514ce209ba156dde5f19429f60c1 Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Tue, 21 May 2024 16:20:40 +0200 Subject: [PATCH 06/10] chore: add sum and max complex --- ...kward_reduce_max_complex_tree_reduction.py | 98 +++++++++++++++++++ ...kward_reduce_sum_complex_tree_reduction.py | 92 +++++++++++++++++ 2 files changed, 190 insertions(+) create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_max_complex_tree_reduction.py create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_sum_complex_tree_reduction.py diff --git a/studies/cuda-kernels/reducers/awkward_reduce_max_complex_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_max_complex_tree_reduction.py new file mode 100644 index 0000000000..4d81b60a80 --- /dev/null +++ b/studies/cuda-kernels/reducers/awkward_reduce_max_complex_tree_reduction.py @@ -0,0 +1,98 @@ +import cupy as cp + +cuda_kernel = """ +extern "C" { + __global__ void awkward_reduce_max_complex_a(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + toptr[thread_id * 2] = identity; + toptr[thread_id * 2 + 1] = 0; + } + } +} + +extern "C" { + __global__ void awkward_reduce_max_complex_b(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) { + extern __shared__ float shared[]; + + int idx = threadIdx.x; + int thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + shared[idx * 2] = fromptr[thread_id * 2]; + shared[idx * 2 + 1] = fromptr[thread_id * 2 + 1]; + } + __syncthreads(); + + for (int stride = 1; stride < blockDim.x; stride *= 2) { + float real = identity; + float imag = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + real = shared[(idx - stride) * 2]; + imag = shared[(idx - stride) * 2 + 1]; + } + __syncthreads(); + if (shared[idx * 2] < real || shared[idx * 2] == real && shared[idx * 2 + 1] < imag) { + shared[idx * 2] = real; + shared[idx * 2 + 1] = imag; + } + __syncthreads(); + } + + if (thread_id < lenparents) { + int parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[(blockIdx.x * outlength + parent) * 2 ] = shared[idx * 2]; + partial[(blockIdx.x * outlength + parent) * 2 + 1] = shared[idx * 2 + 1]; + } + } + } +} + +extern "C" { + __global__ void awkward_reduce_max_complex_c(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + float real = identity; + float imag = 0; + int blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int i = 0; i < blocks; ++i) { + if (real < partial[2 * (i * outlength + thread_id)] || + (partial[2 * (i * outlength + thread_id)] == real && + imag < partial[2 * (i * outlength + thread_id) + 1])) { + real = partial[(i * outlength + thread_id) * 2]; + imag = partial[(i * outlength + thread_id) * 2 + 1]; + } + } + toptr[thread_id * 2] = real; + toptr[thread_id * 2 + 1] = imag; + } + } +} +""" + +parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32) +fromptr = cp.array([1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 0], dtype=cp.float32) +lenparents = len(parents) +outlength = int(cp.max(parents)) + 1 +identity = cp.finfo(cp.float32).min +toptr = cp.full(outlength * 2, identity, dtype=cp.float32) +block_size = 2 +partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), identity, dtype=cp.float32) +grid_size = (lenparents + block_size - 1) // block_size +shared_mem_size = block_size * cp.float32().nbytes + +raw_module = cp.RawModule(code=cuda_kernel) + +awkward_reduce_max_complex_a = raw_module.get_function('awkward_reduce_max_complex_a') +awkward_reduce_max_complex_b = raw_module.get_function('awkward_reduce_max_complex_b') +awkward_reduce_max_complex_c = raw_module.get_function('awkward_reduce_max_complex_c') + +awkward_reduce_max_complex_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial)) +awkward_reduce_max_complex_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial), shared_mem=shared_mem_size) +awkward_reduce_max_complex_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial)) + +toptr_host = toptr[0::2] + 1j * toptr[1::2] +print("tree reduction toptr:", toptr_host) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_complex_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_complex_tree_reduction.py new file mode 100644 index 0000000000..3f042880dc --- /dev/null +++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_complex_tree_reduction.py @@ -0,0 +1,92 @@ +import cupy as cp + +cuda_kernel = """ +extern "C" { + __global__ void awkward_reduce_sum_complex_a(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + toptr[thread_id * 2] = 0; + toptr[thread_id * 2 + 1] = 0; + } + } +} + +extern "C" { + __global__ void awkward_reduce_sum_complex_b(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int* partial) { + extern __shared__ float shared[]; + + int idx = threadIdx.x; + int thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + shared[idx * 2] = fromptr[thread_id * 2]; + shared[idx * 2 + 1] = fromptr[thread_id * 2 + 1]; + } + __syncthreads(); + + for (int stride = 1; stride < blockDim.x; stride *= 2) { + float real = 0; + float imag = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + real = shared[(idx - stride) * 2]; + imag = shared[(idx - stride) * 2 + 1]; + } + __syncthreads(); + shared[idx * 2] += real; + shared[idx * 2 + 1] += imag; + __syncthreads(); + } + + if (thread_id < lenparents) { + int parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[(blockIdx.x * outlength + parent) * 2 ] = shared[idx * 2]; + partial[(blockIdx.x * outlength + parent) * 2 + 1] = shared[idx * 2 + 1]; + } + } + } +} + +extern "C" { + __global__ void awkward_reduce_sum_complex_c(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + float real = 0; + float imag = 0; + int blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int i = 0; i < blocks; ++i) { + real += partial[(i * outlength + thread_id) * 2]; + imag += partial[(i * outlength + thread_id) * 2 + 1]; + } + toptr[thread_id * 2] = real; + toptr[thread_id * 2 + 1] = imag; + } + } +} +""" + +parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32) +fromptr = cp.array([1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 0], dtype=cp.float32) +lenparents = len(parents) +outlength = int(cp.max(parents)) + 1 +toptr = cp.zeros(outlength * 2, dtype=cp.float32) + +block_size = 2 +partial = cp.zeros((outlength * ((lenparents + block_size - 1) // block_size)), dtype=cp.float32) +grid_size = (lenparents + block_size - 1) // block_size +shared_mem_size = block_size * cp.int32().nbytes + +raw_module = cp.RawModule(code=cuda_kernel) + +awkward_reduce_sum_complex_a = raw_module.get_function('awkward_reduce_sum_complex_a') +awkward_reduce_sum_complex_b = raw_module.get_function('awkward_reduce_sum_complex_b') +awkward_reduce_sum_complex_c = raw_module.get_function('awkward_reduce_sum_complex_c') + +awkward_reduce_sum_complex_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) +awkward_reduce_sum_complex_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) +awkward_reduce_sum_complex_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) + +toptr_host = toptr.get() +print("tree reduction toptr:", toptr_host) \ No newline at end of file From a01979782a94c88746a357d5b7bfe77b70e3b70f Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Wed, 22 May 2024 10:27:45 +0200 Subject: [PATCH 07/10] chore: add sum and prod bool --- .../awkward_reduce_argmax_tree_reduction.py | 7 +- .../awkward_reduce_argmin_tree_reduction.py | 7 +- .../awkward_reduce_max_tree_reduction.py | 11 +-- ...kward_reduce_min_complex_tree_reduction.py | 98 +++++++++++++++++++ .../awkward_reduce_min_tree_reduction.py | 11 +-- ...awkward_reduce_prod_bool_tree_reduction.py | 82 ++++++++++++++++ .../awkward_reduce_prod_tree_reduction.py | 7 +- .../awkward_reduce_sum_bool_tree_reduction.py | 82 ++++++++++++++++ ...kward_reduce_sum_complex_tree_reduction.py | 3 +- .../awkward_reduce_sum_tree_reduction.py | 7 +- 10 files changed, 285 insertions(+), 30 deletions(-) create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_min_complex_tree_reduction.py create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py diff --git a/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py index 41221ddf6b..9743c91c9b 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py @@ -12,7 +12,7 @@ } extern "C" { - __global__ void awkward_reduce_argmax_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) { + __global__ void awkward_reduce_argmax_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) { extern __shared__ int shared[]; int idx = threadIdx.x; @@ -45,7 +45,7 @@ } extern "C" { - __global__ void awkward_reduce_argmax_c(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) { + __global__ void awkward_reduce_argmax_c(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) { int thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { @@ -83,5 +83,4 @@ awkward_reduce_argmax_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) awkward_reduce_argmax_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) -toptr_host = toptr.get() -print("tree reduction toptr:", toptr_host) \ No newline at end of file +assert cp.array_equal(toptr, cp.array([0, 2, 8, -1, -1, 9])) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py index 8718339519..0d8cd527a5 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py @@ -12,7 +12,7 @@ } extern "C" { - __global__ void awkward_reduce_argmin_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) { + __global__ void awkward_reduce_argmin_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) { extern __shared__ int shared[]; int idx = threadIdx.x; @@ -45,7 +45,7 @@ } extern "C" { - __global__ void awkward_reduce_argmin_c(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) { + __global__ void awkward_reduce_argmin_c(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) { int thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { @@ -83,5 +83,4 @@ awkward_reduce_argmin_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) awkward_reduce_argmin_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) -toptr_host = toptr.get() -print("tree reduction toptr:", toptr_host) \ No newline at end of file +assert cp.array_equal(toptr, cp.array([0, 1, 3, -1, -1, 9])) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py index 0624c3ed8c..df60f4417e 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py @@ -12,7 +12,7 @@ } extern "C" { - __global__ void awkward_reduce_max_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int identity, int* partial) { + __global__ void awkward_reduce_max_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) { extern __shared__ int shared[]; int idx = threadIdx.x; @@ -43,7 +43,7 @@ } extern "C" { - __global__ void awkward_reduce_max_c(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int identity, int* partial) { + __global__ void awkward_reduce_max_c(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) { int thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { @@ -62,10 +62,10 @@ fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32) lenparents = len(parents) outlength = int(cp.max(parents)) + 1 -toptr = cp.full(outlength, cp.iinfo(cp.int32).min, dtype=cp.int32) identity = cp.iinfo(cp.int32).min +toptr = cp.full(outlength, identity, dtype=cp.int32) block_size = 2 -partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), cp.iinfo(cp.int32).min, dtype=cp.int32) +partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), identity, dtype=cp.int32) grid_size = (lenparents + block_size - 1) // block_size shared_mem_size = block_size * cp.int32().nbytes @@ -79,5 +79,4 @@ awkward_reduce_max_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial), shared_mem=shared_mem_size) awkward_reduce_max_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial)) -toptr_host = toptr.get() -print("tree reduction toptr:", toptr_host) \ No newline at end of file +assert cp.array_equal(toptr, cp.array([1, 3, 9, -2147483648, -2147483648, 10])) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_min_complex_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_min_complex_tree_reduction.py new file mode 100644 index 0000000000..f9cad9abab --- /dev/null +++ b/studies/cuda-kernels/reducers/awkward_reduce_min_complex_tree_reduction.py @@ -0,0 +1,98 @@ +import cupy as cp + +cuda_kernel = """ +extern "C" { + __global__ void awkward_reduce_min_complex_a(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + toptr[thread_id * 2] = identity; + toptr[thread_id * 2 + 1] = 0; + } + } +} + +extern "C" { + __global__ void awkward_reduce_min_complex_b(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) { + extern __shared__ float shared[]; + + int idx = threadIdx.x; + int thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + shared[idx * 2] = fromptr[thread_id * 2]; + shared[idx * 2 + 1] = fromptr[thread_id * 2 + 1]; + } + __syncthreads(); + + for (int stride = 1; stride < blockDim.x; stride *= 2) { + float real = identity; + float imag = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + real = shared[(idx - stride) * 2]; + imag = shared[(idx - stride) * 2 + 1]; + } + __syncthreads(); + if (shared[idx * 2] > real || shared[idx * 2] == real && shared[idx * 2 + 1] > imag) { + shared[idx * 2] = real; + shared[idx * 2 + 1] = imag; + } + __syncthreads(); + } + + if (thread_id < lenparents) { + int parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[(blockIdx.x * outlength + parent) * 2 ] = shared[idx * 2]; + partial[(blockIdx.x * outlength + parent) * 2 + 1] = shared[idx * 2 + 1]; + } + } + } +} + +extern "C" { + __global__ void awkward_reduce_min_complex_c(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + float real = identity; + float imag = 0; + int blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int i = 0; i < blocks; ++i) { + if (real > partial[2 * (i * outlength + thread_id)] || + (partial[2 * (i * outlength + thread_id)] == real && + imag > partial[2 * (i * outlength + thread_id) + 1])) { + real = partial[(i * outlength + thread_id) * 2]; + imag = partial[(i * outlength + thread_id) * 2 + 1]; + } + } + toptr[thread_id * 2] = real; + toptr[thread_id * 2 + 1] = imag; + } + } +} +""" + +parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32) +fromptr = cp.array([1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 0], dtype=cp.float32) +lenparents = len(parents) +outlength = int(cp.max(parents)) + 1 +identity = 111111 +toptr = cp.full(outlength * 2, identity, dtype=cp.float32) +block_size = 2 +partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), identity, dtype=cp.float32) +grid_size = (lenparents + block_size - 1) // block_size +shared_mem_size = block_size * cp.float32().nbytes + +raw_module = cp.RawModule(code=cuda_kernel) + +awkward_reduce_min_complex_a = raw_module.get_function('awkward_reduce_min_complex_a') +awkward_reduce_min_complex_b = raw_module.get_function('awkward_reduce_min_complex_b') +awkward_reduce_min_complex_c = raw_module.get_function('awkward_reduce_min_complex_c') + +awkward_reduce_min_complex_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial)) +awkward_reduce_min_complex_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial), shared_mem=shared_mem_size) +awkward_reduce_min_complex_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial)) + +toptr_host = toptr[0::2] + 1j * toptr[1::2] +print("tree reduction toptr:", toptr_host) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py index 5810ca97e5..e3a64f33d7 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py @@ -12,7 +12,7 @@ } extern "C" { - __global__ void awkward_reduce_min_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int identity, int* partial) { + __global__ void awkward_reduce_min_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) { extern __shared__ int shared[]; int idx = threadIdx.x; @@ -43,7 +43,7 @@ } extern "C" { - __global__ void awkward_reduce_min_c(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int identity, int* partial) { + __global__ void awkward_reduce_min_c(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) { int thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { @@ -62,10 +62,10 @@ fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32) lenparents = len(parents) outlength = int(cp.max(parents)) + 1 -toptr = cp.full(outlength, cp.iinfo(cp.int32).max, dtype=cp.int32) identity = cp.iinfo(cp.int32).max +toptr = cp.full(outlength, identity, dtype=cp.int32) block_size = 2 -partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), cp.iinfo(cp.int32).max, dtype=cp.int32) +partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), identity, dtype=cp.int32) grid_size = (lenparents + block_size - 1) // block_size shared_mem_size = block_size * cp.int32().nbytes @@ -79,5 +79,4 @@ awkward_reduce_min_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial), shared_mem=shared_mem_size) awkward_reduce_min_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial)) -toptr_host = toptr.get() -print("tree reduction toptr:", toptr_host) \ No newline at end of file +assert cp.array_equal(toptr, cp.array([1, 2, 4, 2147483647, 2147483647, 10])) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py new file mode 100644 index 0000000000..02a657ccb0 --- /dev/null +++ b/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py @@ -0,0 +1,82 @@ +import cupy as cp + +cuda_kernel = """ +extern "C" { + __global__ void awkward_reduce_prod_bool_a(bool* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + toptr[thread_id] = true; + } + } +} + +extern "C" { + __global__ void awkward_reduce_prod_bool_b(bool* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) { + extern __shared__ int shared[]; + + int idx = threadIdx.x; + int thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + shared[idx] = fromptr[thread_id]; + } + __syncthreads(); + + for (int stride = 1; stride < blockDim.x; stride *= 2) { + int val = 1; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = shared[idx - stride]; + } + __syncthreads(); + shared[idx] &= (val != 0); + __syncthreads(); + } + + if (thread_id < lenparents) { + int parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = shared[idx]; + } + } + } +} + +extern "C" { + __global__ void awkward_reduce_prod_bool_c(bool* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + int prod = 1; + int blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int i = 0; i < blocks; ++i) { + prod &= (partial[i * outlength + thread_id] != 0); + } + toptr[thread_id] = prod; + } + } +} +""" + +parents = cp.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3], dtype=cp.int32) +fromptr = cp.array([1, 0, 0, 1, 1, 1, 1, 0, 0, 1], dtype=cp.int32) +lenparents = len(parents) +outlength = int(cp.max(parents)) + 1 +toptr = cp.ones(outlength, dtype=cp.bool_) + +block_size = 2 +partial = cp.ones((outlength * ((lenparents + block_size - 1) // block_size)), dtype=cp.int32) +grid_size = (lenparents + block_size - 1) // block_size +shared_mem_size = block_size * cp.int32().nbytes + +raw_module = cp.RawModule(code=cuda_kernel) + +awkward_reduce_prod_bool_a = raw_module.get_function('awkward_reduce_prod_bool_a') +awkward_reduce_prod_bool_b = raw_module.get_function('awkward_reduce_prod_bool_b') +awkward_reduce_prod_bool_c = raw_module.get_function('awkward_reduce_prod_bool_c') + +awkward_reduce_prod_bool_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) +awkward_reduce_prod_bool_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) +awkward_reduce_prod_bool_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) + +assert cp.array_equal(toptr, cp.array([0, 1, 0, 1])) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py index 48f33cc5c8..d85c3863b6 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py @@ -12,7 +12,7 @@ } extern "C" { - __global__ void awkward_reduce_prod_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) { + __global__ void awkward_reduce_prod_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) { extern __shared__ int shared[]; int idx = threadIdx.x; @@ -43,7 +43,7 @@ } extern "C" { - __global__ void awkward_reduce_prod_c(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) { + __global__ void awkward_reduce_prod_c(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) { int thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { @@ -79,5 +79,4 @@ awkward_reduce_prod_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) awkward_reduce_prod_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) -toptr_host = toptr.get() -print("tree reduction toptr:", toptr_host) \ No newline at end of file +assert cp.array_equal(toptr, cp.array([1, 6, 60480, 1, 1, 10])) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py new file mode 100644 index 0000000000..6b44646274 --- /dev/null +++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py @@ -0,0 +1,82 @@ +import cupy as cp + +cuda_kernel = """ +extern "C" { + __global__ void awkward_reduce_sum_bool_a(bool* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + toptr[thread_id] = 0; + } + } +} + +extern "C" { + __global__ void awkward_reduce_sum_bool_b(bool* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) { + extern __shared__ int shared[]; + + int idx = threadIdx.x; + int thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + shared[idx] = fromptr[thread_id]; + } + __syncthreads(); + + for (int stride = 1; stride < blockDim.x; stride *= 2) { + int val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = shared[idx - stride]; + } + __syncthreads(); + shared[idx] |= (val != 0); + __syncthreads(); + } + + if (thread_id < lenparents) { + int parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = shared[idx]; + } + } + } +} + +extern "C" { + __global__ void awkward_reduce_sum_bool_c(bool* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + int sum = 0; + int blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int i = 0; i < blocks; ++i) { + sum |= (partial[i * outlength + thread_id] != 0); + } + toptr[thread_id] = sum; + } + } +} +""" + +parents = cp.array([0, 0, 0, 2, 2, 3, 4, 4, 5], dtype=cp.int32) +fromptr = cp.array([1, 0, 1, 0, 0, 1, 0, 1, 1], dtype=cp.int32) +lenparents = len(parents) +outlength = int(cp.max(parents)) + 1 +toptr = cp.zeros(outlength, dtype=cp.bool_) + +block_size = 2 +partial = cp.zeros((outlength * ((lenparents + block_size - 1) // block_size)), dtype=cp.int32) +grid_size = (lenparents + block_size - 1) // block_size +shared_mem_size = block_size * cp.int32().nbytes + +raw_module = cp.RawModule(code=cuda_kernel) + +awkward_reduce_sum_bool_a = raw_module.get_function('awkward_reduce_sum_bool_a') +awkward_reduce_sum_bool_b = raw_module.get_function('awkward_reduce_sum_bool_b') +awkward_reduce_sum_bool_c = raw_module.get_function('awkward_reduce_sum_bool_c') + +awkward_reduce_sum_bool_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) +awkward_reduce_sum_bool_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) +awkward_reduce_sum_bool_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) + +assert cp.array_equal(toptr, cp.array([1, 0, 0, 1, 1, 1])) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_complex_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_complex_tree_reduction.py index 3f042880dc..06285338de 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_sum_complex_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_complex_tree_reduction.py @@ -88,5 +88,4 @@ awkward_reduce_sum_complex_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) awkward_reduce_sum_complex_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) -toptr_host = toptr.get() -print("tree reduction toptr:", toptr_host) \ No newline at end of file +assert cp.array_equal(toptr, cp.array([1, 0, 5, 3, 39, 33, 0, 0, 0, 0, 10, 0])) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py index c8757ebb64..f04ef5ce85 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py @@ -12,7 +12,7 @@ } extern "C" { - __global__ void awkward_reduce_sum_b(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) { + __global__ void awkward_reduce_sum_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) { extern __shared__ int shared[]; int idx = threadIdx.x; @@ -43,7 +43,7 @@ } extern "C" { - __global__ void awkward_reduce_sum_c(int *toptr, int *fromptr, int *parents, int lenparents, int outlength, int* partial) { + __global__ void awkward_reduce_sum_c(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) { int thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { @@ -79,5 +79,4 @@ awkward_reduce_sum_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) awkward_reduce_sum_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) -toptr_host = toptr.get() -print("tree reduction toptr:", toptr_host) \ No newline at end of file +assert cp.array_equal(toptr, cp.array([1, 5, 39, 0, 0, 10])) \ No newline at end of file From 10c0a6cf8ea532b1c05e23d46325e469b6e38eb1 Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Wed, 22 May 2024 11:47:26 +0200 Subject: [PATCH 08/10] chore: add count kernels --- .../awkward_reduce_argmax_tree_reduction.py | 27 +++--- .../awkward_reduce_argmin_tree_reduction.py | 27 +++--- .../awkward_reduce_count_64_tree_reduction.py | 83 +++++++++++++++++++ ...ward_reduce_countnonzero_tree_reduction.py | 83 +++++++++++++++++++ .../awkward_reduce_max_tree_reduction.py | 29 ++++--- .../awkward_reduce_min_tree_reduction.py | 28 ++++--- ...awkward_reduce_prod_bool_tree_reduction.py | 25 +++--- .../awkward_reduce_prod_tree_reduction.py | 25 +++--- .../awkward_reduce_sum_bool_tree_reduction.py | 27 +++--- .../awkward_reduce_sum_tree_reduction.py | 25 +++--- 10 files changed, 280 insertions(+), 99 deletions(-) create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_count_64_tree_reduction.py create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_countnonzero_tree_reduction.py diff --git a/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py index 9743c91c9b..ca1160f276 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py @@ -68,19 +68,22 @@ lenparents = len(parents) outlength = int(cp.max(parents)) + 1 toptr = cp.full(outlength, -1, dtype=cp.int32) -block_size = 2 -partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), -1, dtype=cp.int32) -grid_size = (lenparents + block_size - 1) // block_size -shared_mem_size = block_size * cp.int32().nbytes -raw_module = cp.RawModule(code=cuda_kernel) -awkward_reduce_argmax_a = raw_module.get_function('awkward_reduce_argmax_a') -awkward_reduce_argmax_b = raw_module.get_function('awkward_reduce_argmax_b') -awkward_reduce_argmax_c = raw_module.get_function('awkward_reduce_argmax_c') +block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024] +for i in range (len(block_size)): + partial = cp.full((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), -1, dtype=cp.int32) + grid_size = (lenparents + block_size[i] - 1) // block_size[i] + shared_mem_size = block_size[i] * cp.int32().nbytes -awkward_reduce_argmax_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) -awkward_reduce_argmax_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) -awkward_reduce_argmax_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) + raw_module = cp.RawModule(code=cuda_kernel) -assert cp.array_equal(toptr, cp.array([0, 2, 8, -1, -1, 9])) \ No newline at end of file + awkward_reduce_argmax_a = raw_module.get_function('awkward_reduce_argmax_a') + awkward_reduce_argmax_b = raw_module.get_function('awkward_reduce_argmax_b') + awkward_reduce_argmax_c = raw_module.get_function('awkward_reduce_argmax_c') + + awkward_reduce_argmax_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) + awkward_reduce_argmax_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) + awkward_reduce_argmax_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) + + assert cp.array_equal(toptr, cp.array([0, 2, 8, -1, -1, 9])) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py index 0d8cd527a5..4756809fc0 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py @@ -68,19 +68,22 @@ lenparents = len(parents) outlength = int(cp.max(parents)) + 1 toptr = cp.full(outlength, -1, dtype=cp.int32) -block_size = 2 -partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), -1, dtype=cp.int32) -grid_size = (lenparents + block_size - 1) // block_size -shared_mem_size = block_size * cp.int32().nbytes -raw_module = cp.RawModule(code=cuda_kernel) -awkward_reduce_argmin_a = raw_module.get_function('awkward_reduce_argmin_a') -awkward_reduce_argmin_b = raw_module.get_function('awkward_reduce_argmin_b') -awkward_reduce_argmin_c = raw_module.get_function('awkward_reduce_argmin_c') +block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024] +for i in range (len(block_size)): + partial = cp.full((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), -1, dtype=cp.int32) + grid_size = (lenparents + block_size[i] - 1) // block_size[i] + shared_mem_size = block_size[i] * cp.int32().nbytes -awkward_reduce_argmin_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) -awkward_reduce_argmin_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) -awkward_reduce_argmin_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) + raw_module = cp.RawModule(code=cuda_kernel) -assert cp.array_equal(toptr, cp.array([0, 1, 3, -1, -1, 9])) \ No newline at end of file + awkward_reduce_argmin_a = raw_module.get_function('awkward_reduce_argmin_a') + awkward_reduce_argmin_b = raw_module.get_function('awkward_reduce_argmin_b') + awkward_reduce_argmin_c = raw_module.get_function('awkward_reduce_argmin_c') + + awkward_reduce_argmin_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) + awkward_reduce_argmin_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) + awkward_reduce_argmin_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) + + assert cp.array_equal(toptr, cp.array([0, 1, 3, -1, -1, 9])) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_count_64_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_count_64_tree_reduction.py new file mode 100644 index 0000000000..66fe580399 --- /dev/null +++ b/studies/cuda-kernels/reducers/awkward_reduce_count_64_tree_reduction.py @@ -0,0 +1,83 @@ +import cupy as cp + +cuda_kernel = """ +extern "C" { + __global__ void awkward_reduce_countnonzero_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + toptr[thread_id] = 0; + } + } +} + +extern "C" { + __global__ void awkward_reduce_countnonzero_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) { + extern __shared__ int shared[]; + + int idx = threadIdx.x; + int thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + shared[idx] = 1; + } + __syncthreads(); + + for (int stride = 1; stride < blockDim.x; stride *= 2) { + int val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = shared[idx - stride]; + } + __syncthreads(); + shared[idx] += val; + __syncthreads(); + } + + if (thread_id < lenparents) { + int parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = shared[idx]; + } + } + } +} + +extern "C" { + __global__ void awkward_reduce_countnonzero_c(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + int countnonzero = 0; + int blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int i = 0; i < blocks; ++i) { + countnonzero += partial[i * outlength + thread_id]; + } + toptr[thread_id] = countnonzero; + } + } +} +""" + +parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32) +fromptr = cp.array([1, 2, 3, 0, 5, 6, 0, 8, 9, 0], dtype=cp.int32) +lenparents = len(parents) +outlength = int(cp.max(parents)) + 1 +toptr = cp.zeros(outlength, dtype=cp.int32) + +block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024] +for i in range (len(block_size)): + partial = cp.zeros((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), dtype=cp.int32) + grid_size = (lenparents + block_size[i] - 1) // block_size[i] + shared_mem_size = block_size[i] * cp.int32().nbytes + + raw_module = cp.RawModule(code=cuda_kernel) + + awkward_reduce_countnonzero_a = raw_module.get_function('awkward_reduce_countnonzero_a') + awkward_reduce_countnonzero_b = raw_module.get_function('awkward_reduce_countnonzero_b') + awkward_reduce_countnonzero_c = raw_module.get_function('awkward_reduce_countnonzero_c') + + awkward_reduce_countnonzero_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) + awkward_reduce_countnonzero_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) + awkward_reduce_countnonzero_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) + + assert cp.array_equal(toptr, cp.array([1, 2, 6, 0, 0, 1])) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_countnonzero_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_countnonzero_tree_reduction.py new file mode 100644 index 0000000000..0d7ca6d5e0 --- /dev/null +++ b/studies/cuda-kernels/reducers/awkward_reduce_countnonzero_tree_reduction.py @@ -0,0 +1,83 @@ +import cupy as cp + +cuda_kernel = """ +extern "C" { + __global__ void awkward_reduce_countnonzero_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + toptr[thread_id] = 0; + } + } +} + +extern "C" { + __global__ void awkward_reduce_countnonzero_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) { + extern __shared__ int shared[]; + + int idx = threadIdx.x; + int thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + shared[idx] = (fromptr[thread_id] != 0) ? 1 : 0; + } + __syncthreads(); + + for (int stride = 1; stride < blockDim.x; stride *= 2) { + int val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = shared[idx - stride]; + } + __syncthreads(); + shared[idx] += val; + __syncthreads(); + } + + if (thread_id < lenparents) { + int parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = shared[idx]; + } + } + } +} + +extern "C" { + __global__ void awkward_reduce_countnonzero_c(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + int countnonzero = 0; + int blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int i = 0; i < blocks; ++i) { + countnonzero += partial[i * outlength + thread_id]; + } + toptr[thread_id] = countnonzero; + } + } +} +""" + +parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32) +fromptr = cp.array([1, 2, 3, 0, 5, 6, 0, 8, 9, 0], dtype=cp.int32) +lenparents = len(parents) +outlength = int(cp.max(parents)) + 1 +toptr = cp.zeros(outlength, dtype=cp.int32) + +block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024] +for i in range (len(block_size)): + partial = cp.zeros((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), dtype=cp.int32) + grid_size = (lenparents + block_size[i] - 1) // block_size[i] + shared_mem_size = block_size[i] * cp.int32().nbytes + + raw_module = cp.RawModule(code=cuda_kernel) + + awkward_reduce_countnonzero_a = raw_module.get_function('awkward_reduce_countnonzero_a') + awkward_reduce_countnonzero_b = raw_module.get_function('awkward_reduce_countnonzero_b') + awkward_reduce_countnonzero_c = raw_module.get_function('awkward_reduce_countnonzero_c') + + awkward_reduce_countnonzero_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) + awkward_reduce_countnonzero_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) + awkward_reduce_countnonzero_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) + + assert cp.array_equal(toptr, cp.array([1, 2, 4, 0, 0, 0])) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py index df60f4417e..dfff51a25f 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py @@ -59,24 +59,27 @@ """ parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32) -fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32) +fromptr = cp.array([1, -2, -3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32) lenparents = len(parents) outlength = int(cp.max(parents)) + 1 identity = cp.iinfo(cp.int32).min toptr = cp.full(outlength, identity, dtype=cp.int32) -block_size = 2 -partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), identity, dtype=cp.int32) -grid_size = (lenparents + block_size - 1) // block_size -shared_mem_size = block_size * cp.int32().nbytes -raw_module = cp.RawModule(code=cuda_kernel) -awkward_reduce_max_a = raw_module.get_function('awkward_reduce_max_a') -awkward_reduce_max_b = raw_module.get_function('awkward_reduce_max_b') -awkward_reduce_max_c = raw_module.get_function('awkward_reduce_max_c') +block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024] +for i in range (len(block_size)): + partial = cp.full((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), identity, dtype=cp.int32) + grid_size = (lenparents + block_size[i] - 1) // block_size[i] + shared_mem_size = block_size[i] * cp.int32().nbytes -awkward_reduce_max_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial)) -awkward_reduce_max_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial), shared_mem=shared_mem_size) -awkward_reduce_max_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial)) + raw_module = cp.RawModule(code=cuda_kernel) -assert cp.array_equal(toptr, cp.array([1, 3, 9, -2147483648, -2147483648, 10])) \ No newline at end of file + awkward_reduce_max_a = raw_module.get_function('awkward_reduce_max_a') + awkward_reduce_max_b = raw_module.get_function('awkward_reduce_max_b') + awkward_reduce_max_c = raw_module.get_function('awkward_reduce_max_c') + + awkward_reduce_max_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, identity, partial)) + awkward_reduce_max_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, identity, partial), shared_mem=shared_mem_size) + awkward_reduce_max_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, identity, partial)) + + assert cp.array_equal(toptr, cp.array([1, -2, 9, -2147483648, -2147483648, 10])) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py index e3a64f33d7..8069a3007d 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py @@ -59,24 +59,26 @@ """ parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32) -fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32) +fromptr = cp.array([1, -2, -3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32) lenparents = len(parents) outlength = int(cp.max(parents)) + 1 identity = cp.iinfo(cp.int32).max toptr = cp.full(outlength, identity, dtype=cp.int32) -block_size = 2 -partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), identity, dtype=cp.int32) -grid_size = (lenparents + block_size - 1) // block_size -shared_mem_size = block_size * cp.int32().nbytes -raw_module = cp.RawModule(code=cuda_kernel) +block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024] +for i in range (len(block_size)): + partial = cp.full((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), identity, dtype=cp.int32) + grid_size = (lenparents + block_size[i] - 1) // block_size[i] + shared_mem_size = block_size[i] * cp.int32().nbytes -awkward_reduce_min_a = raw_module.get_function('awkward_reduce_min_a') -awkward_reduce_min_b = raw_module.get_function('awkward_reduce_min_b') -awkward_reduce_min_c = raw_module.get_function('awkward_reduce_min_c') + raw_module = cp.RawModule(code=cuda_kernel) -awkward_reduce_min_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial)) -awkward_reduce_min_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial), shared_mem=shared_mem_size) -awkward_reduce_min_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial)) + awkward_reduce_min_a = raw_module.get_function('awkward_reduce_min_a') + awkward_reduce_min_b = raw_module.get_function('awkward_reduce_min_b') + awkward_reduce_min_c = raw_module.get_function('awkward_reduce_min_c') -assert cp.array_equal(toptr, cp.array([1, 2, 4, 2147483647, 2147483647, 10])) \ No newline at end of file + awkward_reduce_min_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, identity, partial)) + awkward_reduce_min_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, identity, partial), shared_mem=shared_mem_size) + awkward_reduce_min_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, identity, partial)) + + assert cp.array_equal(toptr, cp.array([1, -3, 4, 2147483647, 2147483647, 10])) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py index 02a657ccb0..2acbbac92e 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py @@ -64,19 +64,20 @@ outlength = int(cp.max(parents)) + 1 toptr = cp.ones(outlength, dtype=cp.bool_) -block_size = 2 -partial = cp.ones((outlength * ((lenparents + block_size - 1) // block_size)), dtype=cp.int32) -grid_size = (lenparents + block_size - 1) // block_size -shared_mem_size = block_size * cp.int32().nbytes +block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024] +for i in range (len(block_size)): + partial = cp.ones((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), dtype=cp.int32) + grid_size = (lenparents + block_size[i] - 1) // block_size[i] + shared_mem_size = block_size[i] * cp.int32().nbytes -raw_module = cp.RawModule(code=cuda_kernel) + raw_module = cp.RawModule(code=cuda_kernel) -awkward_reduce_prod_bool_a = raw_module.get_function('awkward_reduce_prod_bool_a') -awkward_reduce_prod_bool_b = raw_module.get_function('awkward_reduce_prod_bool_b') -awkward_reduce_prod_bool_c = raw_module.get_function('awkward_reduce_prod_bool_c') + awkward_reduce_prod_bool_a = raw_module.get_function('awkward_reduce_prod_bool_a') + awkward_reduce_prod_bool_b = raw_module.get_function('awkward_reduce_prod_bool_b') + awkward_reduce_prod_bool_c = raw_module.get_function('awkward_reduce_prod_bool_c') -awkward_reduce_prod_bool_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) -awkward_reduce_prod_bool_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) -awkward_reduce_prod_bool_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) + awkward_reduce_prod_bool_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) + awkward_reduce_prod_bool_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) + awkward_reduce_prod_bool_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) -assert cp.array_equal(toptr, cp.array([0, 1, 0, 1])) \ No newline at end of file + assert cp.array_equal(toptr, cp.array([0, 1, 0, 1])) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py index d85c3863b6..910ba712b8 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py @@ -64,19 +64,20 @@ outlength = int(cp.max(parents)) + 1 toptr = cp.zeros(outlength, dtype=cp.int32) -block_size = 2 -partial = cp.ones((outlength * ((lenparents + block_size - 1) // block_size)), dtype=cp.int32) -grid_size = (lenparents + block_size - 1) // block_size -shared_mem_size = block_size * cp.int32().nbytes +block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024] +for i in range (len(block_size)): + partial = cp.ones((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), dtype=cp.int32) + grid_size = (lenparents + block_size[i] - 1) // block_size[i] + shared_mem_size = block_size[i] * cp.int32().nbytes -raw_module = cp.RawModule(code=cuda_kernel) + raw_module = cp.RawModule(code=cuda_kernel) -awkward_reduce_prod_a = raw_module.get_function('awkward_reduce_prod_a') -awkward_reduce_prod_b = raw_module.get_function('awkward_reduce_prod_b') -awkward_reduce_prod_c = raw_module.get_function('awkward_reduce_prod_c') + awkward_reduce_prod_a = raw_module.get_function('awkward_reduce_prod_a') + awkward_reduce_prod_b = raw_module.get_function('awkward_reduce_prod_b') + awkward_reduce_prod_c = raw_module.get_function('awkward_reduce_prod_c') -awkward_reduce_prod_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) -awkward_reduce_prod_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) -awkward_reduce_prod_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) + awkward_reduce_prod_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) + awkward_reduce_prod_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) + awkward_reduce_prod_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) -assert cp.array_equal(toptr, cp.array([1, 6, 60480, 1, 1, 10])) \ No newline at end of file + assert cp.array_equal(toptr, cp.array([1, 6, 60480, 1, 1, 10])) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py index 6b44646274..8a512bd9bf 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py @@ -6,7 +6,7 @@ int thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = 0; + toptr[thread_id] = false; } } } @@ -64,19 +64,20 @@ outlength = int(cp.max(parents)) + 1 toptr = cp.zeros(outlength, dtype=cp.bool_) -block_size = 2 -partial = cp.zeros((outlength * ((lenparents + block_size - 1) // block_size)), dtype=cp.int32) -grid_size = (lenparents + block_size - 1) // block_size -shared_mem_size = block_size * cp.int32().nbytes +block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024] +for i in range (len(block_size)): + partial = cp.zeros((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), dtype=cp.int32) + grid_size = (lenparents + block_size[i] - 1) // block_size[i] + shared_mem_size = block_size[i] * cp.int32().nbytes -raw_module = cp.RawModule(code=cuda_kernel) + raw_module = cp.RawModule(code=cuda_kernel) -awkward_reduce_sum_bool_a = raw_module.get_function('awkward_reduce_sum_bool_a') -awkward_reduce_sum_bool_b = raw_module.get_function('awkward_reduce_sum_bool_b') -awkward_reduce_sum_bool_c = raw_module.get_function('awkward_reduce_sum_bool_c') + awkward_reduce_sum_bool_a = raw_module.get_function('awkward_reduce_sum_bool_a') + awkward_reduce_sum_bool_b = raw_module.get_function('awkward_reduce_sum_bool_b') + awkward_reduce_sum_bool_c = raw_module.get_function('awkward_reduce_sum_bool_c') -awkward_reduce_sum_bool_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) -awkward_reduce_sum_bool_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) -awkward_reduce_sum_bool_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) + awkward_reduce_sum_bool_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) + awkward_reduce_sum_bool_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) + awkward_reduce_sum_bool_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) -assert cp.array_equal(toptr, cp.array([1, 0, 0, 1, 1, 1])) \ No newline at end of file + assert cp.array_equal(toptr, cp.array([1, 0, 0, 1, 1, 1])) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py index f04ef5ce85..11428c5475 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py @@ -64,19 +64,20 @@ outlength = int(cp.max(parents)) + 1 toptr = cp.zeros(outlength, dtype=cp.int32) -block_size = 2 -partial = cp.zeros((outlength * ((lenparents + block_size - 1) // block_size)), dtype=cp.int32) -grid_size = (lenparents + block_size - 1) // block_size -shared_mem_size = block_size * cp.int32().nbytes +block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024] +for i in range (len(block_size)): + partial = cp.zeros((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), dtype=cp.int32) + grid_size = (lenparents + block_size[i] - 1) // block_size[i] + shared_mem_size = block_size[i] * cp.int32().nbytes -raw_module = cp.RawModule(code=cuda_kernel) + raw_module = cp.RawModule(code=cuda_kernel) -awkward_reduce_sum_a = raw_module.get_function('awkward_reduce_sum_a') -awkward_reduce_sum_b = raw_module.get_function('awkward_reduce_sum_b') -awkward_reduce_sum_c = raw_module.get_function('awkward_reduce_sum_c') + awkward_reduce_sum_a = raw_module.get_function('awkward_reduce_sum_a') + awkward_reduce_sum_b = raw_module.get_function('awkward_reduce_sum_b') + awkward_reduce_sum_c = raw_module.get_function('awkward_reduce_sum_c') -awkward_reduce_sum_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) -awkward_reduce_sum_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) -awkward_reduce_sum_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) + awkward_reduce_sum_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) + awkward_reduce_sum_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) + awkward_reduce_sum_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) -assert cp.array_equal(toptr, cp.array([1, 5, 39, 0, 0, 10])) \ No newline at end of file + assert cp.array_equal(toptr, cp.array([1, 5, 39, 0, 0, 10])) \ No newline at end of file From fbb11955f8241d36541b38cc58616220c6d9543a Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Wed, 22 May 2024 12:16:26 +0200 Subject: [PATCH 09/10] chore: add sum int32 and int64 bool kernels --- .../awkward_reduce_argmax_tree_reduction.py | 2 +- ...awkward_reduce_prod_bool_tree_reduction.py | 2 +- .../awkward_reduce_sum_bool_tree_reduction.py | 2 +- ...reduce_sum_int32_bool_64_tree_reduction.py | 83 +++++++++++++++++++ ...reduce_sum_int64_bool_64_tree_reduction.py | 83 +++++++++++++++++++ 5 files changed, 169 insertions(+), 3 deletions(-) create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_sum_int32_bool_64_tree_reduction.py create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_sum_int64_bool_64_tree_reduction.py diff --git a/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py index ca1160f276..b0c4d8cfaa 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py @@ -85,5 +85,5 @@ awkward_reduce_argmax_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) awkward_reduce_argmax_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) awkward_reduce_argmax_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) - + assert cp.array_equal(toptr, cp.array([0, 2, 8, -1, -1, 9])) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py index 2acbbac92e..156625fa68 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py @@ -47,7 +47,7 @@ int thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - int prod = 1; + bool prod = 1; int blocks = (lenparents + blockDim.x - 1) / blockDim.x; for (int i = 0; i < blocks; ++i) { prod &= (partial[i * outlength + thread_id] != 0); diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py index 8a512bd9bf..971331a3e6 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py @@ -47,7 +47,7 @@ int thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - int sum = 0; + bool sum = 0; int blocks = (lenparents + blockDim.x - 1) / blockDim.x; for (int i = 0; i < blocks; ++i) { sum |= (partial[i * outlength + thread_id] != 0); diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_int32_bool_64_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_int32_bool_64_tree_reduction.py new file mode 100644 index 0000000000..28a0cb3a34 --- /dev/null +++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_int32_bool_64_tree_reduction.py @@ -0,0 +1,83 @@ +import cupy as cp + +cuda_kernel = """ +extern "C" { + __global__ void awkward_reduce_countnonzero_a(int* toptr, bool* fromptr, int* parents, int lenparents, int outlength, int* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + toptr[thread_id] = 0; + } + } +} + +extern "C" { + __global__ void awkward_reduce_countnonzero_b(int* toptr, bool* fromptr, int* parents, int lenparents, int outlength, int* partial) { + extern __shared__ int shared[]; + + int idx = threadIdx.x; + int thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + shared[idx] = (fromptr[thread_id] != 0) ? 1 : 0; + } + __syncthreads(); + + for (int stride = 1; stride < blockDim.x; stride *= 2) { + int val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = shared[idx - stride]; + } + __syncthreads(); + shared[idx] += val; + __syncthreads(); + } + + if (thread_id < lenparents) { + int parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = shared[idx]; + } + } + } +} + +extern "C" { + __global__ void awkward_reduce_countnonzero_c(int* toptr, bool* fromptr, int* parents, int lenparents, int outlength, int* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + int countnonzero = 0; + int blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int i = 0; i < blocks; ++i) { + countnonzero += partial[i * outlength + thread_id]; + } + toptr[thread_id] = countnonzero; + } + } +} +""" + +parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32) +fromptr = cp.array([1, 1, 1, 0, 1, 1, 0, 1, 1, 0], dtype=cp.bool_) +lenparents = len(parents) +outlength = int(cp.max(parents)) + 1 +toptr = cp.zeros(outlength, dtype=cp.int32) + +block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024] +for i in range (len(block_size)): + partial = cp.zeros((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), dtype=cp.int32) + grid_size = (lenparents + block_size[i] - 1) // block_size[i] + shared_mem_size = block_size[i] * cp.int32().nbytes + + raw_module = cp.RawModule(code=cuda_kernel) + + awkward_reduce_countnonzero_a = raw_module.get_function('awkward_reduce_countnonzero_a') + awkward_reduce_countnonzero_b = raw_module.get_function('awkward_reduce_countnonzero_b') + awkward_reduce_countnonzero_c = raw_module.get_function('awkward_reduce_countnonzero_c') + + awkward_reduce_countnonzero_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) + awkward_reduce_countnonzero_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) + awkward_reduce_countnonzero_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) + + assert cp.array_equal(toptr, cp.array([1, 2, 4, 0, 0, 0])) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_int64_bool_64_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_int64_bool_64_tree_reduction.py new file mode 100644 index 0000000000..3aa595d34f --- /dev/null +++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_int64_bool_64_tree_reduction.py @@ -0,0 +1,83 @@ +import cupy as cp + +cuda_kernel = """ +extern "C" { + __global__ void awkward_reduce_countnonzero_a(long long* toptr, bool* fromptr, long long* parents, long long lenparents, long long outlength, long long* partial) { + long long thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + toptr[thread_id] = 0; + } + } +} + +extern "C" { + __global__ void awkward_reduce_countnonzero_b(long long* toptr, bool* fromptr, long long* parents, long long lenparents, long long outlength, long long* partial) { + extern __shared__ long long shared[]; + + long long idx = threadIdx.x; + long long thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + shared[idx] = (fromptr[thread_id] != 0) ? 1 : 0; + } + __syncthreads(); + + for (long long stride = 1; stride < blockDim.x; stride *= 2) { + long long val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = shared[idx - stride]; + } + __syncthreads(); + shared[idx] += val; + __syncthreads(); + } + + if (thread_id < lenparents) { + long long parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = shared[idx]; + } + } + } +} + +extern "C" { + __global__ void awkward_reduce_countnonzero_c(long long* toptr, bool* fromptr, long long* parents, long long lenparents, long long outlength, long long* partial) { + long long thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + long long countnonzero = 0; + long long blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (long long i = 0; i < blocks; ++i) { + countnonzero += partial[i * outlength + thread_id]; + } + toptr[thread_id] = countnonzero; + } + } +} +""" + +parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int64) +fromptr = cp.array([1, 1, 1, 0, 1, 1, 0, 1, 1, 0], dtype=cp.bool_) +lenparents = len(parents) +outlength = int(cp.max(parents)) + 1 +toptr = cp.zeros(outlength, dtype=cp.int64) + +block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024] +for i in range (len(block_size)): + partial = cp.zeros((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), dtype=cp.int64) + grid_size = (lenparents + block_size[i] - 1) // block_size[i] + shared_mem_size = block_size[i] * cp.int64().nbytes + + raw_module = cp.RawModule(code=cuda_kernel) + + awkward_reduce_countnonzero_a = raw_module.get_function('awkward_reduce_countnonzero_a') + awkward_reduce_countnonzero_b = raw_module.get_function('awkward_reduce_countnonzero_b') + awkward_reduce_countnonzero_c = raw_module.get_function('awkward_reduce_countnonzero_c') + + awkward_reduce_countnonzero_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) + awkward_reduce_countnonzero_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) + awkward_reduce_countnonzero_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) + + assert cp.array_equal(toptr, cp.array([1, 2, 4, 0, 0, 0])) \ No newline at end of file From aa507a2a4d1cb7a88654ddb0e2563a45718356e5 Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Thu, 23 May 2024 14:57:22 +0200 Subject: [PATCH 10/10] chore: add sum and prod complex --- .../awkward_reduce_argmax_tree_reduction.py | 3 +- .../awkward_reduce_argmin_tree_reduction.py | 3 +- .../awkward_reduce_count_64_tree_reduction.py | 3 +- ...ward_reduce_countnonzero_tree_reduction.py | 3 +- .../awkward_reduce_max_tree_reduction.py | 3 +- .../awkward_reduce_min_tree_reduction.py | 3 +- ...awkward_reduce_prod_bool_tree_reduction.py | 3 +- ...ward_reduce_prod_complex_tree_reduction.py | 86 +++++++++++++++++++ .../awkward_reduce_prod_tree_reduction.py | 3 +- .../awkward_reduce_sum_bool_tree_reduction.py | 3 +- ...kward_reduce_sum_complex_tree_reduction.py | 83 +++++++++--------- ...reduce_sum_int32_bool_64_tree_reduction.py | 3 +- ...reduce_sum_int64_bool_64_tree_reduction.py | 3 +- .../awkward_reduce_sum_tree_reduction.py | 3 +- 14 files changed, 149 insertions(+), 56 deletions(-) create mode 100644 studies/cuda-kernels/reducers/awkward_reduce_prod_complex_tree_reduction.py diff --git a/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py index b0c4d8cfaa..011dd3b594 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_argmax_tree_reduction.py @@ -20,6 +20,8 @@ if (thread_id < lenparents) { shared[idx] = thread_id; + } else { + shared[idx] = -1; } __syncthreads(); @@ -28,7 +30,6 @@ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { index = shared[idx - stride]; } - __syncthreads(); if (index != -1 && (shared[idx] == -1 || fromptr[index] > fromptr[shared[idx]])) { shared[idx] = index; } diff --git a/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py index 4756809fc0..972dbaf4dc 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_argmin_tree_reduction.py @@ -20,6 +20,8 @@ if (thread_id < lenparents) { shared[idx] = thread_id; + } else { + shared[idx] = -1; } __syncthreads(); @@ -28,7 +30,6 @@ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { index = shared[idx - stride]; } - __syncthreads(); if (index != -1 && (shared[idx] == -1 || fromptr[index] < fromptr[shared[idx]])) { shared[idx] = index; } diff --git a/studies/cuda-kernels/reducers/awkward_reduce_count_64_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_count_64_tree_reduction.py index 66fe580399..0c894fee35 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_count_64_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_count_64_tree_reduction.py @@ -20,6 +20,8 @@ if (thread_id < lenparents) { shared[idx] = 1; + } else { + shared[idx] = 0; } __syncthreads(); @@ -28,7 +30,6 @@ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { val = shared[idx - stride]; } - __syncthreads(); shared[idx] += val; __syncthreads(); } diff --git a/studies/cuda-kernels/reducers/awkward_reduce_countnonzero_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_countnonzero_tree_reduction.py index 0d7ca6d5e0..774c3531b5 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_countnonzero_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_countnonzero_tree_reduction.py @@ -20,6 +20,8 @@ if (thread_id < lenparents) { shared[idx] = (fromptr[thread_id] != 0) ? 1 : 0; + } else { + shared[idx] = 0; } __syncthreads(); @@ -28,7 +30,6 @@ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { val = shared[idx - stride]; } - __syncthreads(); shared[idx] += val; __syncthreads(); } diff --git a/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py index dfff51a25f..cecc63d415 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_max_tree_reduction.py @@ -20,6 +20,8 @@ if (thread_id < lenparents) { shared[idx] = fromptr[thread_id]; + } else { + shared[idx] = identity; } __syncthreads(); @@ -28,7 +30,6 @@ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { val = shared[idx - stride]; } - __syncthreads(); shared[idx] = max(shared[idx], val); __syncthreads(); } diff --git a/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py index 8069a3007d..fcc3d7e133 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_min_tree_reduction.py @@ -20,6 +20,8 @@ if (thread_id < lenparents) { shared[idx] = fromptr[thread_id]; + } else { + shared[idx] = identity; } __syncthreads(); @@ -28,7 +30,6 @@ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { val = shared[idx - stride]; } - __syncthreads(); shared[idx] = min(shared[idx], val); __syncthreads(); } diff --git a/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py index 156625fa68..a66ca772c2 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_prod_bool_tree_reduction.py @@ -20,6 +20,8 @@ if (thread_id < lenparents) { shared[idx] = fromptr[thread_id]; + } else { + shared[idx] = 1; } __syncthreads(); @@ -28,7 +30,6 @@ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { val = shared[idx - stride]; } - __syncthreads(); shared[idx] &= (val != 0); __syncthreads(); } diff --git a/studies/cuda-kernels/reducers/awkward_reduce_prod_complex_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_prod_complex_tree_reduction.py new file mode 100644 index 0000000000..0a85c488ac --- /dev/null +++ b/studies/cuda-kernels/reducers/awkward_reduce_prod_complex_tree_reduction.py @@ -0,0 +1,86 @@ +import cupy as cp + +cuda_kernel = """ +#include + +extern "C" { + __global__ void awkward_reduce_prod_complex_a(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, float* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + toptr[thread_id * 2] = 1.0f; + toptr[thread_id * 2 + 1] = 0.0f; + } + } + + __global__ void awkward_reduce_prod_complex_b(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, float* partial) { + extern __shared__ cuda::std::complex shared[]; + + int idx = threadIdx.x; + int thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + shared[idx] = cuda::std::complex(fromptr[thread_id * 2], fromptr[thread_id * 2 + 1]); + } + __syncthreads(); + + for (int stride = 1; stride < blockDim.x; stride *= 2) { + cuda::std::complex val(1.0f, 0.0f); + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = shared[idx - stride]; + } + __syncthreads(); + shared[idx] *= val; + __syncthreads(); + } + + if (thread_id < lenparents) { + int parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[(blockIdx.x * outlength + parent) * 2] = shared[idx].real(); + partial[(blockIdx.x * outlength + parent) * 2 + 1] = shared[idx].imag(); + } + } + } + + __global__ void awkward_reduce_prod_complex_c(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, float* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + cuda::std::complex prod(1.0f, 0.0f); + int blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int i = 0; i < blocks; ++i) { + cuda::std::complex val(partial[(i * outlength + thread_id) * 2], partial[(i * outlength + thread_id) * 2 + 1]); + prod = prod * val; + } + toptr[thread_id * 2] = prod.real(); + toptr[thread_id * 2 + 1] = prod.imag(); + } + } +} +""" + +raw_module = cp.RawModule(code=cuda_kernel, options=('-I', '/usr/local/cuda-12.3/include/'),) + +parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 5, 5], dtype=cp.int32) +fromptr = cp.array([1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 6, 0], dtype=cp.float32) +lenparents = len(parents) +outlength = int(cp.max(parents)) + 1 +toptr = cp.zeros(2 * outlength, dtype=cp.float32) + +block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024] +for i in range(len(block_size)): + partial = cp.zeros(2 * outlength * ((lenparents + block_size[i] - 1) // block_size[i]), dtype=cp.float32) + grid_size = (lenparents + block_size[i] - 1) // block_size[i] + shared_mem_size = block_size[i] * 2 * cp.float32().nbytes + + + awkward_reduce_prod_complex_a = raw_module.get_function('awkward_reduce_prod_complex_a') + awkward_reduce_prod_complex_b = raw_module.get_function('awkward_reduce_prod_complex_b') + awkward_reduce_prod_complex_c = raw_module.get_function('awkward_reduce_prod_complex_c') + + awkward_reduce_prod_complex_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) + awkward_reduce_prod_complex_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) + awkward_reduce_prod_complex_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) + + print(block_size[i], toptr.get()) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py index 910ba712b8..38e97fe8f4 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_prod_tree_reduction.py @@ -20,6 +20,8 @@ if (thread_id < lenparents) { shared[idx] = fromptr[thread_id]; + } else { + shared[idx] = 1; } __syncthreads(); @@ -28,7 +30,6 @@ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { val = shared[idx - stride]; } - __syncthreads(); shared[idx] *= val; __syncthreads(); } diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py index 971331a3e6..3f9ab1bae6 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_bool_tree_reduction.py @@ -20,6 +20,8 @@ if (thread_id < lenparents) { shared[idx] = fromptr[thread_id]; + } else { + shared[idx] = 0; } __syncthreads(); @@ -28,7 +30,6 @@ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { val = shared[idx - stride]; } - __syncthreads(); shared[idx] |= (val != 0); __syncthreads(); } diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_complex_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_complex_tree_reduction.py index 06285338de..5c3f20ca56 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_sum_complex_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_complex_tree_reduction.py @@ -1,91 +1,86 @@ import cupy as cp cuda_kernel = """ +#include + extern "C" { - __global__ void awkward_reduce_sum_complex_a(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int* partial) { - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + __global__ void awkward_reduce_sum_complex_a(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, float* partial) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_id < outlength) { - toptr[thread_id * 2] = 0; - toptr[thread_id * 2 + 1] = 0; - } + if (thread_id < outlength) { + toptr[thread_id * 2] = 0.0f; + toptr[thread_id * 2 + 1] = 0.0f; + } } -} - -extern "C" { - __global__ void awkward_reduce_sum_complex_b(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int* partial) { - extern __shared__ float shared[]; + + __global__ void awkward_reduce_sum_complex_b(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, float* partial) { + extern __shared__ cuda::std::complex shared[]; int idx = threadIdx.x; int thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - shared[idx * 2] = fromptr[thread_id * 2]; - shared[idx * 2 + 1] = fromptr[thread_id * 2 + 1]; + shared[idx] = cuda::std::complex(fromptr[thread_id * 2], fromptr[thread_id * 2 + 1]); } __syncthreads(); for (int stride = 1; stride < blockDim.x; stride *= 2) { - float real = 0; - float imag = 0; + cuda::std::complex val(0.0f, 0.0f); if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - real = shared[(idx - stride) * 2]; - imag = shared[(idx - stride) * 2 + 1]; + val = shared[idx - stride]; } __syncthreads(); - shared[idx * 2] += real; - shared[idx * 2 + 1] += imag; + shared[idx] += val; __syncthreads(); } if (thread_id < lenparents) { int parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - partial[(blockIdx.x * outlength + parent) * 2 ] = shared[idx * 2]; - partial[(blockIdx.x * outlength + parent) * 2 + 1] = shared[idx * 2 + 1]; + partial[(blockIdx.x * outlength + parent) * 2] = shared[idx].real(); + partial[(blockIdx.x * outlength + parent) * 2 + 1] = shared[idx].imag(); } } } -} -extern "C" { - __global__ void awkward_reduce_sum_complex_c(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int* partial) { + __global__ void awkward_reduce_sum_complex_c(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, float* partial) { int thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - float real = 0; - float imag = 0; + cuda::std::complex sum(0.0f, 0.0f); int blocks = (lenparents + blockDim.x - 1) / blockDim.x; for (int i = 0; i < blocks; ++i) { - real += partial[(i * outlength + thread_id) * 2]; - imag += partial[(i * outlength + thread_id) * 2 + 1]; + cuda::std::complex val(partial[(i * outlength + thread_id) * 2], partial[(i * outlength + thread_id) * 2 + 1]); + sum += val; } - toptr[thread_id * 2] = real; - toptr[thread_id * 2 + 1] = imag; + toptr[thread_id * 2] = sum.real(); + toptr[thread_id * 2 + 1] = sum.imag(); } } } """ +raw_module = cp.RawModule(code=cuda_kernel, options=('-I', '/usr/local/cuda-12.3/include/'),) + parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32) -fromptr = cp.array([1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 0], dtype=cp.float32) +fromptr = cp.array([1, 0, 2.5677, 1.2345, 3.2367, 2.256576, 4.3456, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 0], dtype=cp.float32) lenparents = len(parents) outlength = int(cp.max(parents)) + 1 -toptr = cp.zeros(outlength * 2, dtype=cp.float32) +toptr = cp.zeros(2 * outlength, dtype=cp.float32) -block_size = 2 -partial = cp.zeros((outlength * ((lenparents + block_size - 1) // block_size)), dtype=cp.float32) -grid_size = (lenparents + block_size - 1) // block_size -shared_mem_size = block_size * cp.int32().nbytes +block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024] +for i in range(len(block_size)): + partial = cp.zeros(2 * outlength * ((lenparents + block_size[i] - 1) // block_size[i]), dtype=cp.float32) + grid_size = (lenparents + block_size[i] - 1) // block_size[i] + shared_mem_size = block_size[i] * 2 * cp.float32().nbytes -raw_module = cp.RawModule(code=cuda_kernel) -awkward_reduce_sum_complex_a = raw_module.get_function('awkward_reduce_sum_complex_a') -awkward_reduce_sum_complex_b = raw_module.get_function('awkward_reduce_sum_complex_b') -awkward_reduce_sum_complex_c = raw_module.get_function('awkward_reduce_sum_complex_c') + awkward_reduce_sum_complex_a = raw_module.get_function('awkward_reduce_sum_complex_a') + awkward_reduce_sum_complex_b = raw_module.get_function('awkward_reduce_sum_complex_b') + awkward_reduce_sum_complex_c = raw_module.get_function('awkward_reduce_sum_complex_c') -awkward_reduce_sum_complex_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) -awkward_reduce_sum_complex_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) -awkward_reduce_sum_complex_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, partial)) + awkward_reduce_sum_complex_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) + awkward_reduce_sum_complex_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size) + awkward_reduce_sum_complex_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial)) -assert cp.array_equal(toptr, cp.array([1, 0, 5, 3, 39, 33, 0, 0, 0, 0, 10, 0])) \ No newline at end of file + print(block_size[i], toptr.get()) \ No newline at end of file diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_int32_bool_64_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_int32_bool_64_tree_reduction.py index 28a0cb3a34..142e111730 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_sum_int32_bool_64_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_int32_bool_64_tree_reduction.py @@ -20,6 +20,8 @@ if (thread_id < lenparents) { shared[idx] = (fromptr[thread_id] != 0) ? 1 : 0; + } else { + shared[idx] = 0; } __syncthreads(); @@ -28,7 +30,6 @@ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { val = shared[idx - stride]; } - __syncthreads(); shared[idx] += val; __syncthreads(); } diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_int64_bool_64_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_int64_bool_64_tree_reduction.py index 3aa595d34f..69be141d00 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_sum_int64_bool_64_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_int64_bool_64_tree_reduction.py @@ -20,6 +20,8 @@ if (thread_id < lenparents) { shared[idx] = (fromptr[thread_id] != 0) ? 1 : 0; + } else { + shared[idx] = 0; } __syncthreads(); @@ -28,7 +30,6 @@ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { val = shared[idx - stride]; } - __syncthreads(); shared[idx] += val; __syncthreads(); } diff --git a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py index 11428c5475..db460ec574 100644 --- a/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py +++ b/studies/cuda-kernels/reducers/awkward_reduce_sum_tree_reduction.py @@ -20,6 +20,8 @@ if (thread_id < lenparents) { shared[idx] = fromptr[thread_id]; + } else { + shared[idx] = 0; } __syncthreads(); @@ -28,7 +30,6 @@ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { val = shared[idx - stride]; } - __syncthreads(); shared[idx] += val; __syncthreads(); }