From 860868d536e5162670bf51950d6c9bde988ff70b Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Wed, 29 May 2024 11:28:40 +0200 Subject: [PATCH 01/33] feat: add tree reduction implementation of argmin and argmax --- dev/generate-kernel-signatures.py | 2 + kernel-test-data.json | 4 +- .../cuda_kernels/awkward_reduce_argmax.cu | 76 +++++++++++++++++-- .../cuda_kernels/awkward_reduce_argmin.cu | 76 +++++++++++++++++-- 4 files changed, 144 insertions(+), 14 deletions(-) diff --git a/dev/generate-kernel-signatures.py b/dev/generate-kernel-signatures.py index b64946626c..e020391e01 100644 --- a/dev/generate-kernel-signatures.py +++ b/dev/generate-kernel-signatures.py @@ -374,6 +374,8 @@ def kernel_signatures_cuda_py(specification): from awkward._connect.cuda import fetch_specialization from awkward._connect.cuda import import_cupy +import math + cupy = import_cupy("Awkward Arrays with CUDA") """ ) diff --git a/kernel-test-data.json b/kernel-test-data.json index fde02211fa..fc0fb07cd5 100644 --- a/kernel-test-data.json +++ b/kernel-test-data.json @@ -23348,7 +23348,7 @@ }, { "name": "awkward_reduce_argmax", - "status": false, + "status": true, "tests": [ { "error": false, @@ -25544,7 +25544,7 @@ }, { "name": "awkward_reduce_argmin", - "status": false, + "status": true, "tests": [ { "error": false, diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu index 555420c3f8..ee57d912fc 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu @@ -3,10 +3,20 @@ // BEGIN PYTHON // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, invocation_index, 
err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code)) +// shared_mem_size = block[0] * toptr.dtype.itemsize +// if block[0] > 0: +// segment = math.floor((outlength + block[0] - 1) / block[0]) +// partial_size = outlength * ((lenparents + block[0] - 1) / block[0]) +// else: +// segment = 0 +// partial_size = 0 +// partial = cupy.full(math.floor(partial_size), -1, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, partial, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, partial, invocation_index, err_code), shared_mem=shared_mem_size) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, invocation_index, err_code)) // out["awkward_reduce_argmax_a", {dtype_specializations}] = None // out["awkward_reduce_argmax_b", {dtype_specializations}] = None +// out["awkward_reduce_argmax_c", {dtype_specializations}] = None // END PYTHON template @@ -17,10 +27,12 @@ awkward_reduce_argmax_a( const U* parents, int64_t lenparents, int64_t outlength, + T* partial, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_id < outlength) { toptr[thread_id] = -1; } @@ -35,17 +47,69 @@ awkward_reduce_argmax_b( const U* parents, 
int64_t lenparents, int64_t outlength, + T* partial, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[]; + T *shared_mem = reinterpret_cast(shared_memory); + + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + shared_mem[idx] = thread_id; + } else { + shared_mem[idx] = -1; + } + __syncthreads(); + + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + int64_t index = -1; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + index = shared_mem[idx - stride]; + } + if (index != -1 && (shared_mem[idx] == -1 || fromptr[index] > fromptr[shared_mem[idx]] || + (fromptr[index] == fromptr[shared_mem[idx]] && index < shared_mem[idx]))) { + shared_mem[idx] = index; + } + __syncthreads(); + } if (thread_id < lenparents) { int64_t parent = parents[thread_id]; - if (toptr[parent] == -1 || - (fromptr[thread_id] > (fromptr[toptr[parent]]))) { - toptr[parent] = thread_id; // we need the last parent filled, thread random order problem, find max arg at that index + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = shared_mem[idx]; + } + } + } +} + +template +__global__ void +awkward_reduce_argmax_c( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + T* partial, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + int64_t argmax = -1; + int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int64_t i = 0; i < blocks; ++i) { + int64_t index = partial[i * outlength + thread_id]; + if (index != -1 && (argmax == -1 || 
fromptr[index] > fromptr[argmax] || + (fromptr[index] == fromptr[argmax] && index < argmax))) { /* index == -1 (empty partial slot) is rejected before any fromptr read; ties keep the smaller index */ + argmax = index; + } + } } + toptr[thread_id] = argmax; } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu index 282ebd11cc..262a51b57a 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu @@ -3,10 +3,20 @@ // BEGIN PYTHON // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code)) +// shared_mem_size = block[0] * toptr.dtype.itemsize +// if block[0] > 0: +// segment = math.floor((outlength + block[0] - 1) / block[0]) +// partial_size = outlength * ((lenparents + block[0] - 1) / block[0]) +// else: +// segment = 0 +// partial_size = 0 +// partial = cupy.full(math.floor(partial_size), -1, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, partial, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, partial, invocation_index, err_code), shared_mem=shared_mem_size) +// 
cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, invocation_index, err_code)) // out["awkward_reduce_argmin_a", {dtype_specializations}] = None // out["awkward_reduce_argmin_b", {dtype_specializations}] = None +// out["awkward_reduce_argmin_c", {dtype_specializations}] = None // END PYTHON template @@ -17,10 +27,12 @@ awkward_reduce_argmin_a( const U* parents, int64_t lenparents, int64_t outlength, + T* partial, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_id < outlength) { toptr[thread_id] = -1; } @@ -35,17 +47,69 @@ awkward_reduce_argmin_b( const U* parents, int64_t lenparents, int64_t outlength, + T* partial, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[]; + T *shared_mem = reinterpret_cast(shared_memory); + + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + shared_mem[idx] = thread_id; + } else { + shared_mem[idx] = -1; + } + __syncthreads(); + + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + int64_t index = -1; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + index = shared_mem[idx - stride]; + } + if (index != -1 && (shared_mem[idx] == -1 || fromptr[index] < fromptr[shared_mem[idx]] || + (fromptr[index] == fromptr[shared_mem[idx]] && index < shared_mem[idx]))) { + shared_mem[idx] = index; + } + __syncthreads(); + } if (thread_id < lenparents) { int64_t parent = parents[thread_id]; - if (toptr[parent] == -1 || - (fromptr[thread_id] < (fromptr[toptr[parent]]))) { - 
toptr[parent] = thread_id; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = shared_mem[idx]; + } + } + } +} + +template +__global__ void +awkward_reduce_argmin_c( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + T* partial, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + int64_t argmin = -1; + int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int64_t i = 0; i < blocks; ++i) { + int64_t index = partial[i * outlength + thread_id]; + if (index != -1 && (argmin == -1 || fromptr[index] < fromptr[argmin] || + (fromptr[index] == fromptr[argmin] && index < argmin))) { /* index == -1 (empty partial slot) is rejected before any fromptr read; ties keep the smaller index */ + argmin = index; + } } + toptr[thread_id] = argmin; } } } From 3cdbd7e1aac65149b44a8a0b5b97bccd4c157357 Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Thu, 30 May 2024 15:37:02 +0200 Subject: [PATCH 02/33] feat: add awkward_ListOffsetArray_reduce_local_outoffsets_64 kernel --- dev/generate-kernel-signatures.py | 1 + dev/generate-tests.py | 1 + kernel-test-data.json | 2 +- src/awkward/_connect/cuda/__init__.py | 1 + ...tOffsetArray_reduce_local_outoffsets_64.cu | 134 ++++++++++++++++++ 5 files changed, 138 insertions(+), 1 deletion(-) create mode 100644 src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu diff --git a/dev/generate-kernel-signatures.py b/dev/generate-kernel-signatures.py index e020391e01..c70f787c25 100644 --- a/dev/generate-kernel-signatures.py +++ b/dev/generate-kernel-signatures.py @@ -101,6 +101,7 @@ "awkward_ListOffsetArray_drop_none_indexes", "awkward_ListOffsetArray_reduce_local_nextparents_64", "awkward_ListOffsetArray_reduce_nonlocal_maxcount_offsetscopy_64", + "awkward_ListOffsetArray_reduce_local_outoffsets_64", 
"awkward_UnionArray_flatten_length", "awkward_UnionArray_flatten_combine", "awkward_UnionArray_nestedfill_tags_index", diff --git a/dev/generate-tests.py b/dev/generate-tests.py index 37dc859b9a..db89f2655e 100644 --- a/dev/generate-tests.py +++ b/dev/generate-tests.py @@ -886,6 +886,7 @@ def gencpuunittests(specdict): "awkward_ListOffsetArray_drop_none_indexes", "awkward_ListOffsetArray_reduce_local_nextparents_64", "awkward_ListOffsetArray_reduce_nonlocal_maxcount_offsetscopy_64", + "awkward_ListOffsetArray_reduce_local_outoffsets_64", "awkward_UnionArray_flatten_length", "awkward_UnionArray_flatten_combine", "awkward_UnionArray_nestedfill_tags_index", diff --git a/kernel-test-data.json b/kernel-test-data.json index fc0fb07cd5..db4bb86c93 100644 --- a/kernel-test-data.json +++ b/kernel-test-data.json @@ -18851,7 +18851,7 @@ }, { "name": "awkward_ListOffsetArray_reduce_local_outoffsets_64", - "status": false, + "status": true, "tests": [ { "error": false, diff --git a/src/awkward/_connect/cuda/__init__.py b/src/awkward/_connect/cuda/__init__.py index 354fdcd217..5d7a77b8c5 100644 --- a/src/awkward/_connect/cuda/__init__.py +++ b/src/awkward/_connect/cuda/__init__.py @@ -105,6 +105,7 @@ def fetch_template_specializations(kernel_dict): "awkward_ListOffsetArray_drop_none_indexes", "awkward_ListOffsetArray_reduce_nonlocal_nextstarts_64", "awkward_ListOffsetArray_reduce_nonlocal_maxcount_offsetscopy_64", + "awkward_ListOffsetArray_reduce_local_outoffsets_64", "awkward_ListOffsetArray_rpad_length_axis1", "awkward_MaskedArray_getitem_next_jagged_project", "awkward_UnionArray_nestedfill_tags_index", diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu new file mode 100644 index 0000000000..16dec17447 --- /dev/null +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu @@ -0,0 +1,134 
@@ +// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +// BEGIN PYTHON +// def f(grid, block, args): +// (outoffsets, parents, lenparents, outlength, invocation_index, err_code) = args +// shared_mem_size = block[0] * outoffsets.dtype.itemsize +// if block[0] > 0: +// segment = math.floor((outlength + block[0] - 1) / block[0]) +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) +// else: +// segment = 0 +// grid_size = 1 +// print(block, grid_size) +// parents = cupy.sort(parents) +// partial = cupy.zeros(outlength * grid_size, dtype=outoffsets.dtype) +// temp = cupy.zeros(lenparents, dtype=cupy.int64) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, partial, temp, invocation_index, err_code), shared_mem=shared_mem_size) +// scan_in_array = cupy.zeros(outlength, dtype=cupy.int64) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_c", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((segment,), block, (outoffsets, parents, lenparents, outlength, partial, scan_in_array, invocation_index, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_d", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, partial, scan_in_array, invocation_index, err_code)) +// out["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", 
{dtype_specializations}] = None +// out["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", {dtype_specializations}] = None +// out["awkward_ListOffsetArray_reduce_local_outoffsets_64_c", {dtype_specializations}] = None +// out["awkward_ListOffsetArray_reduce_local_outoffsets_64_d", {dtype_specializations}] = None +// END PYTHON + +template +__global__ void +awkward_ListOffsetArray_reduce_local_outoffsets_64_a( + T* outoffsets, + const C* parents, + int64_t lenparents, + int64_t outlength, + T* partial, + int64_t* temp, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + outoffsets[thread_id] = 0; + } + } +} + +template +__global__ void +awkward_ListOffsetArray_reduce_local_outoffsets_64_b( + T* outoffsets, + const C* parents, + int64_t lenparents, + int64_t outlength, + T* partial, + int64_t* temp, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + temp[thread_id] = 1; + } + __syncthreads(); + + + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + int64_t val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[thread_id - stride]; + } + __syncthreads(); + if (thread_id < lenparents) { temp[thread_id] += val; } /* temp has lenparents entries; threads past the end still reach both barriers but must not touch it */ + __syncthreads(); + } + + + if (thread_id < lenparents) { + int64_t parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = temp[thread_id]; + } + } + } +} + +template +__global__ void +awkward_ListOffsetArray_reduce_local_outoffsets_64_c( + T* outoffsets, + const C* parents, + int64_t lenparents, + int64_t outlength, + T* partial, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* 
err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + int64_t count = 0; + int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int64_t i = 0; i < blocks; ++i) { + count += partial[i * outlength + thread_id]; + } + scan_in_array[thread_id] = count; + } + } +} + +template +__global__ void +awkward_ListOffsetArray_reduce_local_outoffsets_64_d( + T* outoffsets, + const C* parents, + int64_t lenparents, + int64_t outlength, + T* partial, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + outoffsets[0] = 0; + + if (thread_id < outlength) { + outoffsets[thread_id + 1] = scan_in_array[thread_id]; + } + } +} From c1a846b15f832d0b506aea81417c3869a66e1d02 Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Thu, 30 May 2024 15:39:54 +0200 Subject: [PATCH 03/33] test: integration tests for cuda --- .../test_3136_cuda_argmin_and_argmax.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 tests-cuda/test_3136_cuda_argmin_and_argmax.py diff --git a/tests-cuda/test_3136_cuda_argmin_and_argmax.py b/tests-cuda/test_3136_cuda_argmin_and_argmax.py new file mode 100644 index 0000000000..8f1f6613bc --- /dev/null +++ b/tests-cuda/test_3136_cuda_argmin_and_argmax.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +import awkward as ak + +to_list = ak.operations.to_list + + +def test_argmin_argmax_axis_None(): + array = ak.highlevel.Array( + [ + [ + [2022, 2023, 2025], + [], + [2027, 2011], + [2013], + ], + [], + [[2017, 2019], [2023]], + ], + ) + cuda_array = ak.to_backend(array, "cuda") + + assert ak.operations.argmin(cuda_array) == 4 + assert ak.operations.argmax(cuda_array) == 3 + + +def test(): + array = ak.highlevel.Array([1, 2, 3, None, 4]) + + cuda_array = ak.to_backend(array, "cuda") + + assert ak.operations.argmax(cuda_array) 
== 4 From 7be3f982616280de58289997326b2b47907b7d70 Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Thu, 30 May 2024 16:22:52 +0200 Subject: [PATCH 04/33] test: some more integration tests for cuda --- .../test_3136_cuda_argmin_and_argmax.py | 160 +++++++++++++++++- 1 file changed, 158 insertions(+), 2 deletions(-) diff --git a/tests-cuda/test_3136_cuda_argmin_and_argmax.py b/tests-cuda/test_3136_cuda_argmin_and_argmax.py index 8f1f6613bc..cc60ecfd51 100644 --- a/tests-cuda/test_3136_cuda_argmin_and_argmax.py +++ b/tests-cuda/test_3136_cuda_argmin_and_argmax.py @@ -1,11 +1,14 @@ from __future__ import annotations +import cupy as cp +import pytest + import awkward as ak to_list = ak.operations.to_list -def test_argmin_argmax_axis_None(): +def test_0835_argmin_argmax_axis_None(): array = ak.highlevel.Array( [ [ @@ -24,9 +27,162 @@ def test_argmin_argmax_axis_None(): assert ak.operations.argmax(cuda_array) == 3 -def test(): +def test_1106_argminmax_axis_None_missing_values(): array = ak.highlevel.Array([1, 2, 3, None, 4]) cuda_array = ak.to_backend(array, "cuda") assert ak.operations.argmax(cuda_array) == 4 + + +def test_0070_argmin_and_argmax_jagged(): + v2_array = ak.operations.from_iter( + [[2.2, 1.1, 3.3], [], [4.4, 5.5], [5.5], [-4.4, -5.5, -6.6]], highlevel=False + ) + + cuda_v2_array = ak.to_backend(v2_array, "cuda", highlevel=False) + + assert to_list(ak.argmin(cuda_v2_array, axis=1, highlevel=False)) == [ + 1, + None, + 0, + 0, + 2, + ] + assert ( + ak.argmin(cuda_v2_array.to_typetracer(), axis=1, highlevel=False).form + == ak.argmin(cuda_v2_array, axis=1, highlevel=False).form + ) + + index2 = ak.index.Index64(cp.array([4, 3, 2, 1, 0], dtype=cp.int64)) + cuda_v2_array2 = ak.contents.IndexedArray(index2, cuda_v2_array) + + assert to_list(ak.argmin(cuda_v2_array2, axis=1, highlevel=False)) == [ + 2, + 0, + 0, + None, + 1, + ] + assert ( + ak.argmin(cuda_v2_array2.to_typetracer(), axis=1, highlevel=False).form + == ak.argmin(cuda_v2_array2, axis=1, 
highlevel=False).form + ) + + index3 = ak.index.Index64(cp.array([4, 3, -1, 4, 0], dtype=cp.int64)) + cuda_v2_array2 = ak.contents.IndexedOptionArray(index3, cuda_v2_array) + + assert to_list(ak.argmin(cuda_v2_array2, axis=1, highlevel=False)) == [ + 2, + 0, + None, + 2, + 1, + ] + assert ( + ak.argmin(cuda_v2_array2.to_typetracer(), axis=1, highlevel=False).form + == ak.argmin(cuda_v2_array2, axis=1, highlevel=False).form + ) + assert to_list(ak.argmin(cuda_v2_array2, axis=-1, highlevel=False)) == [ + 2, + 0, + None, + 2, + 1, + ] + assert ( + ak.argmin(cuda_v2_array2.to_typetracer(), axis=-1, highlevel=False).form + == ak.argmin(cuda_v2_array2, axis=-1, highlevel=False).form + ) + + +def test_0070_argmin_and_argmax_missing(): + array = ak.operations.from_iter( + [[[2.2, 1.1, 3.3]], [[]], [None, None, None], [[-4.4, -5.5, -6.6]]], + highlevel=False, + ) + + cuda_array = ak.to_backend(array, "cuda", highlevel=False) + + assert to_list(ak.argmin(cuda_array, axis=2, highlevel=False)) == [ + [1], + [None], + [None, None, None], + [2], + ] + assert ( + ak.argmin(cuda_array.to_typetracer(), axis=2, highlevel=False).form + == ak.argmin(cuda_array, axis=2, highlevel=False).form + ) + + +def test_0115_generic_reducer_operation_ByteMaskedArray(): + content = ak.operations.from_iter( + [ + [[1.1, 0.0, 2.2], [], [3.3, 4.4]], + [], + [[5.5]], + [[6.6, 9.9, 8.8, 7.7]], + [[], [12.2, 11.1, 10.0]], + ], + highlevel=False, + ) + mask = ak.index.Index8(cp.array([0, 0, 1, 1, 0], dtype=cp.int8)) + content = ak.to_backend(content, "cuda", highlevel=False) + + cuda_v2_array = ak.contents.ByteMaskedArray(mask, content, valid_when=False) + + assert to_list(cuda_v2_array) == [ + [[1.1, 0.0, 2.2], [], [3.3, 4.4]], + [], + None, + None, + [[], [12.2, 11.1, 10.0]], + ] + assert to_list(ak.argmin(cuda_v2_array, axis=-1, highlevel=False)) == [ + [1, None, 0], + [], + None, + None, + [None, 2], + ] + assert ( + ak.argmin(cuda_v2_array.to_typetracer(), axis=-1, highlevel=False).form + == 
ak.argmin(cuda_v2_array, axis=-1, highlevel=False).form + ) + + +@pytest.mark.parametrize( + "func", + [ + ak.argmin, + ak.argmax, + ], +) +def test_2754_highlevel_behavior_missing_reducers(func): + behavior_1 = {"foo": "bar"} + behavior_2 = {"baz": "bargh!"} + + array = ak.Array([[1, 2, 3, 4], [5], [10]]) + + cuda_array = ak.to_backend(array, "cuda") + + assert isinstance(func(cuda_array, axis=1, highlevel=True), ak.Array) + assert isinstance(func(cuda_array, axis=1, highlevel=False), ak.contents.Content) + assert ( + func( + ak.Array(cuda_array, behavior=behavior_1), + axis=1, + highlevel=True, + behavior=behavior_2, + ).behavior + == behavior_2 + ) + assert ( + func( + ak.Array(cuda_array, behavior=behavior_1), + axis=1, + highlevel=True, + ).behavior + == behavior_1 + ) From 98fb7ed9cf8db45063cecb658a3287859caf2b6d Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Thu, 6 Jun 2024 09:24:43 +0200 Subject: [PATCH 05/33] feat: add awkward_reduce_count_64 kernel --- kernel-test-data.json | 4 +- ...tOffsetArray_reduce_local_outoffsets_64.cu | 4 - .../cuda_kernels/awkward_reduce_count_64.cu | 89 +++++++++++++------ 3 files changed, 65 insertions(+), 32 deletions(-) diff --git a/kernel-test-data.json b/kernel-test-data.json index db4bb86c93..900f81c525 100644 --- a/kernel-test-data.json +++ b/kernel-test-data.json @@ -23795,7 +23795,7 @@ }, { "name": "awkward_reduce_count_64", - "status": false, + "status": true, "tests": [ { "error": false, @@ -23839,7 +23839,7 @@ "inputs": { "lenparents": 1696, "outlength": 331, - "parents": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 170, 170, 170, 170, 
170, 170, 170, 170, 170, 170, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 137, 137, 137, 137, 137, 137, 137, 137, 137, 137, 154, 154, 154, 154, 154, 154, 154, 154, 154, 154, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 138, 138, 138, 138, 138, 138, 138, 138, 138, 138, 155, 155, 155, 155, 155, 155, 155, 155, 155, 155, 172, 172, 172, 172, 172, 172, 172, 172, 172, 172, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 71, 71, 71, 71, 
71, 71, 71, 71, 71, 71, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 173, 173, 173, 173, 173, 173, 173, 173, 173, 173, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, 106, 106, 106, 106, 106, 106, 106, 106, 106, 106, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 157, 157, 157, 157, 157, 157, 157, 157, 157, 157, 174, 174, 174, 174, 174, 174, 174, 174, 174, 174, 191, 191, 191, 191, 191, 191, 191, 191, 191, 191, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 141, 141, 141, 141, 141, 141, 141, 141, 141, 141, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 159, 159, 159, 159, 159, 159, 159, 159, 159, 159, 176, 176, 176, 176, 176, 176, 176, 176, 176, 176, 193, 193, 193, 193, 193, 193, 193, 193, 193, 193, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 143, 143, 143, 143, 143, 143, 143, 143, 143, 143, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 177, 177, 177, 177, 177, 177, 177, 177, 177, 177, 194, 194, 194, 194, 194, 194, 194, 194, 194, 194, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 
14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16] + "parents": [194, 194, 194, 194, 194, 194, 194, 194, 194, 194, 193, 193, 193, 193, 193, 193, 193, 193, 193, 193, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 191, 191, 191, 191, 191, 191, 191, 191, 191, 191, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 177, 177, 177, 177, 177, 177, 177, 177, 177, 177, 176, 176, 176, 176, 176, 176, 176, 176, 176, 176, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 174, 174, 174, 174, 174, 174, 174, 174, 174, 174, 173, 173, 173, 173, 173, 173, 173, 173, 173, 173, 172, 172, 172, 172, 172, 172, 172, 172, 172, 172, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 159, 159, 159, 159, 159, 159, 159, 159, 159, 159, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 157, 157, 157, 157, 157, 157, 157, 157, 157, 157, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 155, 155, 155, 155, 155, 155, 155, 155, 155, 155, 154, 154, 154, 154, 154, 154, 154, 154, 154, 154, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 143, 143, 143, 143, 143, 143, 143, 143, 143, 143, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 141, 141, 141, 141, 141, 141, 141, 141, 141, 141, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 138, 138, 138, 138, 138, 138, 138, 138, 138, 138, 137, 137, 137, 137, 137, 137, 137, 137, 137, 137, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 121, 121, 121, 121, 121, 121, 121, 
121, 121, 121, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 106, 106, 106, 106, 106, 106, 106, 106, 106, 106, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 
22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] }, "outputs": { "toptr": [626, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu 
index 16dec17447..e94183229b 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu @@ -10,8 +10,6 @@ // else: // segment = 0 // grid_size = 1 -// print(block, grid_size) -// parents = cupy.sort(parents) // partial = cupy.zeros(outlength * grid_size, dtype=outoffsets.dtype) // temp = cupy.zeros(lenparents, dtype=cupy.int64) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) @@ -66,7 +64,6 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_b( } __syncthreads(); - for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { int64_t val = 0; if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { @@ -77,7 +74,6 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_b( __syncthreads(); } - if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu index 311f04012b..d880c36f0e 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu @@ -2,68 +2,105 @@ // BEGIN PYTHON // def f(grid, block, args): -// (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args -// atomicAdd_toptr = cupy.array(toptr, dtype=cupy.uint64) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_a", toptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, 
lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_b", toptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_c", toptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) +// (toptr, parents, lenparents, outlength, invocation_index, err_code) = args +// shared_mem_size = block[0] * toptr.dtype.itemsize +// if block[0] > 0: +// segment = math.floor((outlength + block[0] - 1) / block[0]) +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) +// else: +// segment = 0 +// grid_size = 1 +// partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype) +// temp = cupy.zeros(lenparents, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_a", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_b", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code), shared_mem=shared_mem_size) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_c", cupy.dtype(toptr.dtype).type, parents.dtype]))((segment,), block, (toptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) // out["awkward_reduce_count_64_a", {dtype_specializations}] = None // out["awkward_reduce_count_64_b", {dtype_specializations}] = None // out["awkward_reduce_count_64_c", {dtype_specializations}] = None // END PYTHON -template +template __global__ void 
awkward_reduce_count_64_a( T* toptr, - const bool* fromptr, - const U* parents, + const C* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* partial, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_id < outlength) { - atomicAdd_toptr[thread_id] = 0; + toptr[thread_id] = 0; } } } -template +template __global__ void awkward_reduce_count_64_b( T* toptr, - const bool* fromptr, - const U* parents, + const C* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* partial, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; + if (thread_id < lenparents) { - atomicAdd(atomicAdd_toptr + parents[thread_id], (uint64_t)1); + temp[thread_id] = 1; + } + __syncthreads(); + + + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + int64_t val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[thread_id - stride]; + } + __syncthreads(); + temp[thread_id] += val; + __syncthreads(); + } + + if (thread_id < lenparents) { + int64_t parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = temp[thread_id]; + } } } } -template +template __global__ void -awkward_reduce_count_64_c(T* toptr, - const bool* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_count_64_c( + T* toptr, + const C* parents, + int64_t lenparents, + int64_t outlength, + T* partial, + T* temp, + uint64_t invocation_index, + uint64_t* err_code) { if 
(err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_id < outlength) { - toptr[thread_id] = (T)atomicAdd_toptr[thread_id]; + int64_t count = 0; + int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int64_t i = 0; i < blocks; ++i) { + count += partial[i * outlength + thread_id]; + } + toptr[thread_id] = count; } } } From 0ed94efb9ee6b549c9544decf3bcce77635d5c72 Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Thu, 6 Jun 2024 09:52:13 +0200 Subject: [PATCH 06/33] fix: indexing and indentation --- ...tOffsetArray_reduce_local_outoffsets_64.cu | 27 +++++++-------- .../cuda_kernels/awkward_reduce_argmax.cu | 34 +++++++++---------- .../cuda_kernels/awkward_reduce_argmin.cu | 34 +++++++++---------- .../cuda_kernels/awkward_reduce_count_64.cu | 28 +++++++-------- 4 files changed, 58 insertions(+), 65 deletions(-) diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu index e94183229b..5ae3d2eb56 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu @@ -3,7 +3,6 @@ // BEGIN PYTHON // def f(grid, block, args): // (outoffsets, parents, lenparents, outlength, invocation_index, err_code) = args -// shared_mem_size = block[0] * outoffsets.dtype.itemsize // if block[0] > 0: // segment = math.floor((outlength + block[0] - 1) / block[0]) // grid_size = math.floor((lenparents + block[0] - 1) / block[0]) @@ -13,7 +12,7 @@ // partial = cupy.zeros(outlength * grid_size, dtype=outoffsets.dtype) // temp = cupy.zeros(lenparents, dtype=cupy.int64) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, 
(outoffsets, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, partial, temp, invocation_index, err_code), shared_mem=shared_mem_size) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) // scan_in_array = cupy.zeros(outlength, dtype=cupy.int64) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_c", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((segment,), block, (outoffsets, parents, lenparents, outlength, partial, scan_in_array, invocation_index, err_code)) // scan_in_array = cupy.cumsum(scan_in_array) @@ -60,25 +59,25 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_b( int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - temp[thread_id] = 1; + temp[idx] = 1; } __syncthreads(); for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { - int64_t val = 0; - if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - val = temp[thread_id - stride]; - } - __syncthreads(); - temp[thread_id] += val; - __syncthreads(); + int64_t val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[idx - stride]; + } + __syncthreads(); + temp[idx] += val; + __syncthreads(); } if (thread_id < lenparents) { - int64_t parent = parents[thread_id]; - if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - partial[blockIdx.x * outlength + parent] = 
temp[thread_id]; - } + int64_t parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = temp[idx]; + } } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu index ee57d912fc..122894795e 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu @@ -3,17 +3,17 @@ // BEGIN PYTHON // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args -// shared_mem_size = block[0] * toptr.dtype.itemsize // if block[0] > 0: // segment = math.floor((outlength + block[0] - 1) / block[0]) -// partial_size = outlength * ((lenparents + block[0] - 1) / block[0]) +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) // else: // segment = 0 -// partial_size = 0 -// partial = cupy.full(math.floor(partial_size), -1, dtype=toptr.dtype) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, partial, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, partial, invocation_index, err_code), shared_mem=shared_mem_size) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, invocation_index, err_code)) +// grid_size = 1 +// partial = cupy.full(outlength * grid_size, 
-1, dtype=toptr.dtype) +// temp = cupy.zeros(lenparents, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) // out["awkward_reduce_argmax_a", {dtype_specializations}] = None // out["awkward_reduce_argmax_b", {dtype_specializations}] = None // out["awkward_reduce_argmax_c", {dtype_specializations}] = None @@ -28,6 +28,7 @@ awkward_reduce_argmax_a( int64_t lenparents, int64_t outlength, T* partial, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { @@ -48,30 +49,26 @@ awkward_reduce_argmax_b( int64_t lenparents, int64_t outlength, T* partial, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[]; - T *shared_mem = reinterpret_cast(shared_memory); - int64_t idx = threadIdx.x; int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - shared_mem[idx] = thread_id; - } else { - shared_mem[idx] = -1; + temp[idx] = thread_id; } __syncthreads(); for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { int64_t index = -1; if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - 
stride]) { - index = shared_mem[idx - stride]; + index = temp[idx - stride]; } - if (index != -1 && (shared_mem[idx] == -1 || fromptr[index] > fromptr[shared_mem[idx]] || - (fromptr[index] == fromptr[shared_mem[idx]] && index < shared_mem[idx]))) { - shared_mem[idx] = index; + if (index != -1 && (temp[idx] == -1 || fromptr[index] > fromptr[temp[idx]] || + (fromptr[index] == fromptr[temp[idx]] && index < temp[idx]))) { + temp[idx] = index; } __syncthreads(); } @@ -79,7 +76,7 @@ awkward_reduce_argmax_b( if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - partial[blockIdx.x * outlength + parent] = shared_mem[idx]; + partial[blockIdx.x * outlength + parent] = temp[idx]; } } } @@ -94,6 +91,7 @@ awkward_reduce_argmax_c( int64_t lenparents, int64_t outlength, T* partial, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu index 262a51b57a..40a8437218 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu @@ -3,17 +3,17 @@ // BEGIN PYTHON // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args -// shared_mem_size = block[0] * toptr.dtype.itemsize // if block[0] > 0: // segment = math.floor((outlength + block[0] - 1) / block[0]) -// partial_size = outlength * ((lenparents + block[0] - 1) / block[0]) +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) // else: // segment = 0 -// partial_size = 0 -// partial = cupy.full(math.floor(partial_size), -1, dtype=toptr.dtype) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_a", cupy.dtype(toptr.dtype).type, 
cupy.dtype(fromptr.dtype).type, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, partial, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, partial, invocation_index, err_code), shared_mem=shared_mem_size) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, invocation_index, err_code)) +// grid_size = 1 +// partial = cupy.full(outlength * grid_size, -1, dtype=toptr.dtype) +// temp = cupy.zeros(lenparents, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) // out["awkward_reduce_argmin_a", {dtype_specializations}] = None // out["awkward_reduce_argmin_b", {dtype_specializations}] = None // out["awkward_reduce_argmin_c", {dtype_specializations}] = None @@ -28,6 +28,7 @@ awkward_reduce_argmin_a( int64_t lenparents, int64_t outlength, T* partial, + T* 
temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { @@ -48,30 +49,26 @@ awkward_reduce_argmin_b( int64_t lenparents, int64_t outlength, T* partial, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[]; - T *shared_mem = reinterpret_cast(shared_memory); - int64_t idx = threadIdx.x; int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - shared_mem[idx] = thread_id; - } else { - shared_mem[idx] = -1; + temp[idx] = thread_id; } __syncthreads(); for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { int64_t index = -1; if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - index = shared_mem[idx - stride]; + index = temp[idx - stride]; } - if (index != -1 && (shared_mem[idx] == -1 || fromptr[index] < fromptr[shared_mem[idx]] || - (fromptr[index] == fromptr[shared_mem[idx]] && index < shared_mem[idx]))) { - shared_mem[idx] = index; + if (index != -1 && (temp[idx] == -1 || fromptr[index] < fromptr[temp[idx]] || + (fromptr[index] == fromptr[temp[idx]] && index < temp[idx]))) { + temp[idx] = index; } __syncthreads(); } @@ -79,7 +76,7 @@ awkward_reduce_argmin_b( if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - partial[blockIdx.x * outlength + parent] = shared_mem[idx]; + partial[blockIdx.x * outlength + parent] = temp[idx]; } } } @@ -94,6 +91,7 @@ awkward_reduce_argmin_c( int64_t lenparents, int64_t outlength, T* partial, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu index d880c36f0e..cdf870c63c 100644 --- 
a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu @@ -3,7 +3,6 @@ // BEGIN PYTHON // def f(grid, block, args): // (toptr, parents, lenparents, outlength, invocation_index, err_code) = args -// shared_mem_size = block[0] * toptr.dtype.itemsize // if block[0] > 0: // segment = math.floor((outlength + block[0] - 1) / block[0]) // grid_size = math.floor((lenparents + block[0] - 1) / block[0]) @@ -13,7 +12,7 @@ // partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype) // temp = cupy.zeros(lenparents, dtype=toptr.dtype) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_a", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_b", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code), shared_mem=shared_mem_size) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_b", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_c", cupy.dtype(toptr.dtype).type, parents.dtype]))((segment,), block, (toptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) // out["awkward_reduce_count_64_a", {dtype_specializations}] = None // out["awkward_reduce_count_64_b", {dtype_specializations}] = None @@ -56,26 +55,25 @@ awkward_reduce_count_64_b( int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - temp[thread_id] = 1; + temp[idx] = 1; } __syncthreads(); - for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { - 
int64_t val = 0; - if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - val = temp[thread_id - stride]; - } - __syncthreads(); - temp[thread_id] += val; - __syncthreads(); + int64_t val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[idx - stride]; + } + __syncthreads(); + temp[idx] += val; + __syncthreads(); } if (thread_id < lenparents) { - int64_t parent = parents[thread_id]; - if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - partial[blockIdx.x * outlength + parent] = temp[thread_id]; - } + int64_t parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = temp[idx]; + } } } } From 02c03bc70b73df950ebf70efeea1951e4c7569f6 Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Thu, 6 Jun 2024 10:00:05 +0200 Subject: [PATCH 07/33] feat: add awkward_reduce_countnonzero kernel --- kernel-test-data.json | 2 +- .../awkward_reduce_countnonzero.cu | 72 ++++++++++++++----- 2 files changed, 54 insertions(+), 20 deletions(-) diff --git a/kernel-test-data.json b/kernel-test-data.json index 900f81c525..9616d7eb7b 100644 --- a/kernel-test-data.json +++ b/kernel-test-data.json @@ -23698,7 +23698,7 @@ }, { "name": "awkward_reduce_countnonzero", - "status": false, + "status": true, "tests": [ { "error": false, diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu index 6b07dfa208..11bb84b18f 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu @@ -3,10 +3,17 @@ // BEGIN PYTHON // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, 
err_code) = args -// atomicAdd_toptr = cupy.array(toptr, dtype=cupy.uint64) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_c", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) +// if block[0] > 0: +// segment = math.floor((outlength + block[0] - 1) / block[0]) +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) +// else: +// segment = 0 +// grid_size = 1 +// partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype) +// temp = cupy.zeros(lenparents, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) // 
out["awkward_reduce_countnonzero_a", {dtype_specializations}] = None // out["awkward_reduce_countnonzero_b", {dtype_specializations}] = None // out["awkward_reduce_countnonzero_c", {dtype_specializations}] = None @@ -20,14 +27,15 @@ awkward_reduce_countnonzero_a( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* partial, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - atomicAdd_toptr[thread_id] = 0; + toptr[thread_id] = 0; } } } @@ -40,34 +48,60 @@ awkward_reduce_countnonzero_b( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* partial, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - atomicAdd(atomicAdd_toptr + parents[thread_id], - (uint64_t)(fromptr[thread_id] != 0)); + temp[idx] = (fromptr[thread_id] != 0) ? 
1 : 0; + } + __syncthreads(); + + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + int64_t val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[idx - stride]; + } + __syncthreads(); + temp[idx] += val; + __syncthreads(); + } + + if (thread_id < lenparents) { + int64_t parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = temp[idx]; + } } } } template __global__ void -awkward_reduce_countnonzero_c(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_countnonzero_c( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + T* partial, + T* temp, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = (T)atomicAdd_toptr[thread_id]; + int64_t count = 0; + int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int64_t i = 0; i < blocks; ++i) { + count += partial[i * outlength + thread_id]; + } + toptr[thread_id] = count; } } } From 34fc82b3aa800c028c6e8a91f1be5e30e5f53347 Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Thu, 6 Jun 2024 10:16:13 +0200 Subject: [PATCH 08/33] feat: add reduce sum, min and max kernels --- kernel-test-data.json | 6 +- .../cuda/cuda_kernels/awkward_reduce_max.cu | 71 +++++++++++++++++-- .../cuda/cuda_kernels/awkward_reduce_min.cu | 71 +++++++++++++++++-- .../cuda/cuda_kernels/awkward_reduce_sum.cu | 57 +++++++++++---- 4 files changed, 178 insertions(+), 27 deletions(-) diff --git a/kernel-test-data.json b/kernel-test-data.json index 9616d7eb7b..21148f61dd 100644 --- a/kernel-test-data.json +++ 
b/kernel-test-data.json @@ -23510,7 +23510,7 @@ }, { "name": "awkward_reduce_max", - "status": false, + "status": true, "tests": [ { "error": false, @@ -24173,7 +24173,7 @@ }, { "name": "awkward_reduce_sum", - "status": false, + "status": true, "tests": [ { "error": false, @@ -25342,7 +25342,7 @@ }, { "name": "awkward_reduce_min", - "status": false, + "status": true, "tests": [ { "error": false, diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu index 3c20b653ac..4ac7df43ba 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu @@ -3,10 +3,20 @@ // BEGIN PYTHON // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, identity, invocation_index, err_code) = args -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, identity, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, identity, invocation_index, err_code)) +// if block[0] > 0: +// segment = math.floor((outlength + block[0] - 1) / block[0]) +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) +// else: +// segment = 0 +// grid_size = 1 +// partial = cupy.full(outlength * grid_size, identity, dtype=toptr.dtype) +// temp = cupy.zeros(lenparents, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code)) +// 
cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code)) // out["awkward_reduce_max_a", {dtype_specializations}] = None // out["awkward_reduce_max_b", {dtype_specializations}] = None +// out["awkward_reduce_max_c", {dtype_specializations}] = None // END PYTHON template @@ -18,10 +28,13 @@ awkward_reduce_max_a( int64_t lenparents, int64_t outlength, T identity, + T* partial, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_id < outlength) { toptr[thread_id] = identity; } @@ -37,15 +50,61 @@ awkward_reduce_max_b( int64_t lenparents, int64_t outlength, T identity, + T* partial, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + temp[idx] = fromptr[thread_id]; + } + __syncthreads(); + + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + T val = identity; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[idx - stride]; + } + __syncthreads(); + temp[idx] = val > temp[idx] ? val : temp[idx]; + __syncthreads(); + } if (thread_id < lenparents) { - C x = fromptr[thread_id]; - toptr[parents[thread_id]] = - (x > toptr[parents[thread_id]] ? 
x : toptr[parents[thread_id]]); + int64_t parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = temp[idx]; + } + } + } +} + +template +__global__ void +awkward_reduce_max_c( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + T identity, + T* partial, + T* temp, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + T maximum = identity; + int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int64_t i = 0; i < blocks; ++i) { + maximum = maximum > partial[i * outlength + thread_id] ? maximum : partial[i * outlength + thread_id]; + } + toptr[thread_id] = maximum; } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu index ae0e2dcb61..f524485e58 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu @@ -3,10 +3,20 @@ // BEGIN PYTHON // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, identity, invocation_index, err_code) = args -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, identity, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, identity, invocation_index, err_code)) +// if block[0] > 0: +// segment = math.floor((outlength + block[0] - 1) / block[0]) +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) +// else: +// segment = 0 
+// grid_size = 1 +// partial = cupy.full(outlength * grid_size, identity, dtype=toptr.dtype) +// temp = cupy.zeros(lenparents, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code)) // out["awkward_reduce_min_a", {dtype_specializations}] = None // out["awkward_reduce_min_b", {dtype_specializations}] = None +// out["awkward_reduce_min_c", {dtype_specializations}] = None // END PYTHON template @@ -18,10 +28,13 @@ awkward_reduce_min_a( int64_t lenparents, int64_t outlength, T identity, + T* partial, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_id < outlength) { toptr[thread_id] = identity; } @@ -37,15 +50,61 @@ awkward_reduce_min_b( int64_t lenparents, int64_t outlength, T identity, + T* partial, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + temp[idx] = fromptr[thread_id]; + } + __syncthreads(); + + for (int64_t stride = 
1; stride < blockDim.x; stride *= 2) { + T val = identity; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[idx - stride]; + } + __syncthreads(); + temp[idx] = val < temp[idx] ? val : temp[idx]; + __syncthreads(); + } if (thread_id < lenparents) { - C x = fromptr[thread_id]; - toptr[parents[thread_id]] = - (x < toptr[parents[thread_id]] ? x : toptr[parents[thread_id]]); + int64_t parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = temp[idx]; + } + } + } +} + +template +__global__ void +awkward_reduce_min_c( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + T identity, + T* partial, + T* temp, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + T minimum = identity; + int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int64_t i = 0; i < blocks; ++i) { + minimum = minimum < partial[i * outlength + thread_id] ? 
minimum : partial[i * outlength + thread_id]; + } + toptr[thread_id] = minimum; } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu index 13c5a31dbf..e641d728b4 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu @@ -3,10 +3,17 @@ // BEGIN PYTHON // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args -// atomicAdd_toptr = cupy.array(toptr, dtype=cupy.uint64) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_c", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) +// if block[0] > 0: +// segment = math.floor((outlength + block[0] - 1) / block[0]) +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) +// else: +// segment = 0 +// grid_size = 1 +// partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype) +// temp = cupy.zeros(lenparents, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_b", 
cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) // out["awkward_reduce_sum_a", {dtype_specializations}] = None // out["awkward_reduce_sum_b", {dtype_specializations}] = None // out["awkward_reduce_sum_c", {dtype_specializations}] = None @@ -20,14 +27,15 @@ awkward_reduce_sum_a( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* partial, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - atomicAdd_toptr[thread_id] = 0; + toptr[thread_id] = 0; } } } @@ -40,15 +48,34 @@ awkward_reduce_sum_b( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* partial, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + temp[idx] = fromptr[thread_id]; + } + __syncthreads(); + + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + T val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[idx - stride]; + } + __syncthreads(); + temp[idx] += val; + __syncthreads(); + } if (thread_id < lenparents) { - atomicAdd(atomicAdd_toptr + parents[thread_id], - (uint64_t)fromptr[thread_id]); + int64_t parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || 
parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = temp[idx]; + } } } } @@ -61,14 +88,20 @@ awkward_reduce_sum_c( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* partial, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = (T)atomicAdd_toptr[thread_id]; + T sum = 0; + int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int64_t i = 0; i < blocks; ++i) { + sum += partial[i * outlength + thread_id]; + } + toptr[thread_id] = sum; } } } From 4e00f0723c9c98d32ee2d47c0fbe92b9ac6cdf42 Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Thu, 6 Jun 2024 10:37:24 +0200 Subject: [PATCH 09/33] feat: add reduce prod and sum_int_bool --- dev/generate-kernel-signatures.py | 1 + dev/generate-tests.py | 1 + kernel-test-data.json | 794 +++++++++++++++++- src/awkward/_connect/cuda/__init__.py | 1 + .../cuda/cuda_kernels/awkward_reduce_prod.cu | 107 +++ .../awkward_reduce_sum_int32_bool_64.cu | 72 +- .../awkward_reduce_sum_int64_bool_64.cu | 72 +- 7 files changed, 1009 insertions(+), 39 deletions(-) create mode 100644 src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu diff --git a/dev/generate-kernel-signatures.py b/dev/generate-kernel-signatures.py index c70f787c25..4f02a5ca42 100644 --- a/dev/generate-kernel-signatures.py +++ b/dev/generate-kernel-signatures.py @@ -117,6 +117,7 @@ "awkward_reduce_sum_int32_bool_64", "awkward_reduce_sum_int64_bool_64", "awkward_reduce_sum_bool", + "awkward_reduce_prod", "awkward_reduce_prod_bool", "awkward_reduce_countnonzero", "awkward_sorting_ranges", diff --git a/dev/generate-tests.py b/dev/generate-tests.py index db89f2655e..7267d5659c 100644 --- a/dev/generate-tests.py +++ b/dev/generate-tests.py @@ -902,6 +902,7 @@ def gencpuunittests(specdict): "awkward_reduce_sum_int32_bool_64", 
"awkward_reduce_sum_int64_bool_64", "awkward_reduce_sum_bool", + "awkward_reduce_prod", "awkward_reduce_prod_bool", "awkward_reduce_countnonzero", "awkward_sorting_ranges", diff --git a/kernel-test-data.json b/kernel-test-data.json index 21148f61dd..0df339f461 100644 --- a/kernel-test-data.json +++ b/kernel-test-data.json @@ -24567,9 +24567,801 @@ } ] }, + { + "name": "awkward_reduce_sum_int32_bool_64", + "status": true, + "tests": [ + { + "error": false, + "message": "", + "inputs": { + "fromptr": [], + "lenparents": 0, + "outlength": 0, + "parents": [] + }, + "outputs": { + "toptr": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0], + "lenparents": 1, + "outlength": 1, + "parents": [0] + }, + "outputs": { + "toptr": [0] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 5, 20, 1, 6, 21, 2, 7, 22, 3, 8, 23, 4, 9, 24], + "lenparents": 15, + "outlength": 10, + "parents": [0, 5, 5, 1, 6, 6, 2, 7, 7, 3, 8, 8, 4, 9, 9] + }, + "outputs": { + "toptr": [0, 1, 1, 1, 1, 2, 2, 2, 2, 2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 3, 5, 7, 11, 13, 17, 19, 23], + "lenparents": 9, + "outlength": 6, + "parents": [0, 0, 0, 2, 2, 3, 4, 4, 5] + }, + "outputs": { + "toptr": [3, 0, 2, 1, 2, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 0, 0, 1, 0, 0], + "lenparents": 6, + "outlength": 4, + "parents": [0, 0, 0, 2, 2, 3] + }, + "outputs": { + "toptr": [1, 0, 1, 0] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24], + "lenparents": 15, + "outlength": 3, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2] + }, + "outputs": { + "toptr": [4, 5, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], + "lenparents": 30, + "outlength": 6, + "parents": 
[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5] + }, + "outputs": { + "toptr": [4, 5, 5, 5, 5, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 1, 3, 4, 5, 6], + "lenparents": 6, + "outlength": 4, + "parents": [0, 0, 1, 3, 3, 3] + }, + "outputs": { + "toptr": [1, 1, 0, 3] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 5, 10, 15, 25, 1, 11, 16, 26, 2, 12, 17, 27, 8, 18, 28, 4, 9, 14, 29], + "lenparents": 20, + "outlength": 10, + "parents": [0, 0, 0, 5, 5, 1, 1, 6, 6, 2, 2, 7, 7, 3, 8, 8, 4, 4, 4, 9] + }, + "outputs": { + "toptr": [2, 2, 2, 1, 3, 2, 2, 2, 2, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [15, 20, 25, 16, 21, 26, 17, 22, 27, 18, 23, 28, 19, 24, 29], + "lenparents": 15, + "outlength": 15, + "parents": [0, 5, 10, 1, 6, 11, 2, 7, 12, 3, 8, 13, 4, 9, 14] + }, + "outputs": { + "toptr": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 15, 5, 10, 25, 1, 16, 11, 26, 2, 17, 12, 27, 18, 8, 28, 4, 9, 14, 29], + "lenparents": 20, + "outlength": 15, + "parents": [0, 0, 5, 10, 10, 1, 1, 11, 11, 2, 2, 12, 12, 3, 8, 13, 4, 9, 14, 14] + }, + "outputs": { + "toptr": [1, 2, 2, 1, 1, 1, 0, 0, 1, 1, 2, 2, 2, 1, 2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 15, 5, 20, 10, 25, 1, 16, 6, 21, 11, 26, 2, 17, 7, 22, 12, 27, 3, 18, 8, 23, 13, 28, 4, 19, 9, 24, 14, 29], + "lenparents": 30, + "outlength": 15, + "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 3, 8, 8, 13, 13, 4, 4, 9, 9, 14, 14] + }, + "outputs": { + "toptr": [1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 5, 10, 15, 20, 25, 1, 6, 11, 16, 21, 26, 2, 7, 12, 17, 22, 27, 3, 8, 13, 18, 23, 28, 4, 9, 14, 19, 24, 29], + "lenparents": 30, + "outlength": 10, + "parents": [0, 
0, 0, 5, 5, 5, 1, 1, 1, 6, 6, 6, 2, 2, 2, 7, 7, 7, 3, 3, 3, 8, 8, 8, 4, 4, 4, 9, 9, 9] + }, + "outputs": { + "toptr": [2, 3, 3, 3, 3, 3, 3, 3, 3, 3] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0], + "lenparents": 12, + "outlength": 3, + "parents": [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2] + }, + "outputs": { + "toptr": [4, 4, 0] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 2, 3, 4, 5, 1, 2, 3, 4, 5], + "lenparents": 10, + "outlength": 2, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + }, + "outputs": { + "toptr": [5, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 2, 3, 4, 5, 6], + "lenparents": 6, + "outlength": 1, + "parents": [0, 0, 0, 0, 0, 0] + }, + "outputs": { + "toptr": [6] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 7, 13, 17, 23, 3, 11, 19, 5], + "lenparents": 9, + "outlength": 8, + "parents": [0, 0, 0, 6, 6, 1, 1, 7, 2] + }, + "outputs": { + "toptr": [3, 2, 1, 0, 0, 0, 2, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 16, 0, 2, 32, 0, 4, 64, 0, 8, 128, 0], + "lenparents": 12, + "outlength": 4, + "parents": [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3] + }, + "outputs": { + "toptr": [2, 2, 2, 2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 1, 2, 3, 4, 5], + "lenparents": 6, + "outlength": 4, + "parents": [0, 0, 0, 2, 2, 3] + }, + "outputs": { + "toptr": [2, 0, 2, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 4, 1, 3, 5, 6], + "lenparents": 6, + "outlength": 4, + "parents": [0, 0, 1, 1, 1, 3] + }, + "outputs": { + "toptr": [1, 3, 0, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 4, 9, 16, 25, 1, 4, 9, 16, 25], + "lenparents": 10, + "outlength": 2, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + }, + "outputs": { + "toptr": [5, 5] + } + }, + { + "error": false, + 
"message": "", + "inputs": { + "fromptr": [1, 4, 9, 16, 26, 1, 4, 10, 16, 24], + "lenparents": 10, + "outlength": 2, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + }, + "outputs": { + "toptr": [5, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 5, 20, 1, 6, 21, 2, 7, 22, 3, 8, 23, 4, 9, 24], + "lenparents": 15, + "outlength": 10, + "parents": [0, 0, 5, 1, 1, 6, 2, 2, 7, 3, 3, 8, 4, 4, 9] + }, + "outputs": { + "toptr": [1, 2, 2, 2, 2, 1, 1, 1, 1, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [15, 20, 25, 16, 21, 26, 17, 22, 27, 18, 23, 28, 19, 24, 29], + "lenparents": 15, + "outlength": 5, + "parents": [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4] + }, + "outputs": { + "toptr": [3, 3, 3, 3, 3] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 2, 3], + "lenparents": 3, + "outlength": 1, + "parents": [0, 0, 0] + }, + "outputs": { + "toptr": [3] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 1, 2, 4, 5, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 25, 26, 27, 28, 29], + "lenparents": 20, + "outlength": 6, + "parents": [0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 5, 5, 5, 5, 5] + }, + "outputs": { + "toptr": [3, 3, 4, 4, 0, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 2, 4, 5, 5], + "lenparents": 5, + "outlength": 3, + "parents": [0, 0, 0, 2, 2] + }, + "outputs": { + "toptr": [3, 0, 2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], + "lenparents": 15, + "outlength": 3, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2] + }, + "outputs": { + "toptr": [5, 5, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [4, 1, 0, 1, 4, 5, 1, 0, 1, 3], + "lenparents": 10, + "outlength": 2, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + }, + "outputs": { + "toptr": [4, 4] + } + }, + { + "error": false, + 
"message": "", + "inputs": { + "fromptr": [4, 1, 0, 1, 4, 4, 1, 0, 1, 4], + "lenparents": 10, + "outlength": 2, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + }, + "outputs": { + "toptr": [4, 4] + } + } + ] + }, + { + "name": "awkward_reduce_sum_int64_bool_64", + "status": true, + "tests": [ + { + "error": false, + "message": "", + "inputs": { + "fromptr": [], + "lenparents": 0, + "outlength": 0, + "parents": [] + }, + "outputs": { + "toptr": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0], + "lenparents": 1, + "outlength": 1, + "parents": [0] + }, + "outputs": { + "toptr": [0] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 5, 20, 1, 6, 21, 2, 7, 22, 3, 8, 23, 4, 9, 24], + "lenparents": 15, + "outlength": 10, + "parents": [0, 5, 5, 1, 6, 6, 2, 7, 7, 3, 8, 8, 4, 9, 9] + }, + "outputs": { + "toptr": [0, 1, 1, 1, 1, 2, 2, 2, 2, 2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 3, 5, 7, 11, 13, 17, 19, 23], + "lenparents": 9, + "outlength": 6, + "parents": [0, 0, 0, 2, 2, 3, 4, 4, 5] + }, + "outputs": { + "toptr": [3, 0, 2, 1, 2, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 0, 0, 1, 0, 0], + "lenparents": 6, + "outlength": 4, + "parents": [0, 0, 0, 2, 2, 3] + }, + "outputs": { + "toptr": [1, 0, 1, 0] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24], + "lenparents": 15, + "outlength": 3, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2] + }, + "outputs": { + "toptr": [4, 5, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], + "lenparents": 30, + "outlength": 6, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5] + }, + "outputs": { + "toptr": [4, 5, 5, 5, 5, 5] + } + 
}, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 1, 3, 4, 5, 6], + "lenparents": 6, + "outlength": 4, + "parents": [0, 0, 1, 3, 3, 3] + }, + "outputs": { + "toptr": [1, 1, 0, 3] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 5, 10, 15, 25, 1, 11, 16, 26, 2, 12, 17, 27, 8, 18, 28, 4, 9, 14, 29], + "lenparents": 20, + "outlength": 10, + "parents": [0, 0, 0, 5, 5, 1, 1, 6, 6, 2, 2, 7, 7, 3, 8, 8, 4, 4, 4, 9] + }, + "outputs": { + "toptr": [2, 2, 2, 1, 3, 2, 2, 2, 2, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [15, 20, 25, 16, 21, 26, 17, 22, 27, 18, 23, 28, 19, 24, 29], + "lenparents": 15, + "outlength": 15, + "parents": [0, 5, 10, 1, 6, 11, 2, 7, 12, 3, 8, 13, 4, 9, 14] + }, + "outputs": { + "toptr": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 15, 5, 10, 25, 1, 16, 11, 26, 2, 17, 12, 27, 18, 8, 28, 4, 9, 14, 29], + "lenparents": 20, + "outlength": 15, + "parents": [0, 0, 5, 10, 10, 1, 1, 11, 11, 2, 2, 12, 12, 3, 8, 13, 4, 9, 14, 14] + }, + "outputs": { + "toptr": [1, 2, 2, 1, 1, 1, 0, 0, 1, 1, 2, 2, 2, 1, 2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 15, 5, 20, 10, 25, 1, 16, 6, 21, 11, 26, 2, 17, 7, 22, 12, 27, 3, 18, 8, 23, 13, 28, 4, 19, 9, 24, 14, 29], + "lenparents": 30, + "outlength": 15, + "parents": [0, 0, 5, 5, 10, 10, 1, 1, 6, 6, 11, 11, 2, 2, 7, 7, 12, 12, 3, 3, 8, 8, 13, 13, 4, 4, 9, 9, 14, 14] + }, + "outputs": { + "toptr": [1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 5, 10, 15, 20, 25, 1, 6, 11, 16, 21, 26, 2, 7, 12, 17, 22, 27, 3, 8, 13, 18, 23, 28, 4, 9, 14, 19, 24, 29], + "lenparents": 30, + "outlength": 10, + "parents": [0, 0, 0, 5, 5, 5, 1, 1, 1, 6, 6, 6, 2, 2, 2, 7, 7, 7, 3, 3, 3, 8, 8, 8, 4, 4, 4, 9, 9, 9] + }, + "outputs": { + "toptr": [2, 3, 3, 3, 3, 3, 3, 3, 3, 
3] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0], + "lenparents": 12, + "outlength": 3, + "parents": [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2] + }, + "outputs": { + "toptr": [4, 4, 0] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 2, 3, 4, 5, 1, 2, 3, 4, 5], + "lenparents": 10, + "outlength": 2, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + }, + "outputs": { + "toptr": [5, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 2, 3, 4, 5, 6], + "lenparents": 6, + "outlength": 1, + "parents": [0, 0, 0, 0, 0, 0] + }, + "outputs": { + "toptr": [6] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 7, 13, 17, 23, 3, 11, 19, 5], + "lenparents": 9, + "outlength": 8, + "parents": [0, 0, 0, 6, 6, 1, 1, 7, 2] + }, + "outputs": { + "toptr": [3, 2, 1, 0, 0, 0, 2, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 16, 0, 2, 32, 0, 4, 64, 0, 8, 128, 0], + "lenparents": 12, + "outlength": 4, + "parents": [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3] + }, + "outputs": { + "toptr": [2, 2, 2, 2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 1, 2, 3, 4, 5], + "lenparents": 6, + "outlength": 4, + "parents": [0, 0, 0, 2, 2, 3] + }, + "outputs": { + "toptr": [2, 0, 2, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 4, 1, 3, 5, 6], + "lenparents": 6, + "outlength": 4, + "parents": [0, 0, 1, 1, 1, 3] + }, + "outputs": { + "toptr": [1, 3, 0, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 4, 9, 16, 25, 1, 4, 9, 16, 25], + "lenparents": 10, + "outlength": 2, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + }, + "outputs": { + "toptr": [5, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 4, 9, 16, 26, 1, 4, 10, 16, 24], + "lenparents": 10, + "outlength": 2, + "parents": [0, 0, 0, 0, 0, 1, 
1, 1, 1, 1] + }, + "outputs": { + "toptr": [5, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 5, 20, 1, 6, 21, 2, 7, 22, 3, 8, 23, 4, 9, 24], + "lenparents": 15, + "outlength": 10, + "parents": [0, 0, 5, 1, 1, 6, 2, 2, 7, 3, 3, 8, 4, 4, 9] + }, + "outputs": { + "toptr": [1, 2, 2, 2, 2, 1, 1, 1, 1, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [15, 20, 25, 16, 21, 26, 17, 22, 27, 18, 23, 28, 19, 24, 29], + "lenparents": 15, + "outlength": 5, + "parents": [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4] + }, + "outputs": { + "toptr": [3, 3, 3, 3, 3] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 2, 3], + "lenparents": 3, + "outlength": 1, + "parents": [0, 0, 0] + }, + "outputs": { + "toptr": [3] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 1, 2, 4, 5, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 25, 26, 27, 28, 29], + "lenparents": 20, + "outlength": 6, + "parents": [0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 5, 5, 5, 5, 5] + }, + "outputs": { + "toptr": [3, 3, 4, 4, 0, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 2, 4, 5, 5], + "lenparents": 5, + "outlength": 3, + "parents": [0, 0, 0, 2, 2] + }, + "outputs": { + "toptr": [3, 0, 2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], + "lenparents": 15, + "outlength": 3, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2] + }, + "outputs": { + "toptr": [5, 5, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [4, 1, 0, 1, 4, 5, 1, 0, 1, 3], + "lenparents": 10, + "outlength": 2, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + }, + "outputs": { + "toptr": [4, 4] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [4, 1, 0, 1, 4, 4, 1, 0, 1, 4], + "lenparents": 10, + "outlength": 2, + "parents": [0, 0, 0, 0, 0, 1, 1, 1, 1, 
1] + }, + "outputs": { + "toptr": [4, 4] + } + } + ] + }, { "name": "awkward_reduce_prod", - "status": false, + "status": true, "tests": [ { "error": false, diff --git a/src/awkward/_connect/cuda/__init__.py b/src/awkward/_connect/cuda/__init__.py index 5d7a77b8c5..447002c1c4 100644 --- a/src/awkward/_connect/cuda/__init__.py +++ b/src/awkward/_connect/cuda/__init__.py @@ -118,6 +118,7 @@ def fetch_template_specializations(kernel_dict): "awkward_reduce_sum_int32_bool_64", "awkward_reduce_sum_int64_bool_64", "awkward_reduce_sum_bool", + "awkward_reduce_prod", "awkward_reduce_prod_bool", "awkward_reduce_argmax", "awkward_reduce_argmin", diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu new file mode 100644 index 0000000000..1f7e6d4ff0 --- /dev/null +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu @@ -0,0 +1,107 @@ +// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +// BEGIN PYTHON +// def f(grid, block, args): +// (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args +// if block[0] > 0: +// segment = math.floor((outlength + block[0] - 1) / block[0]) +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) +// else: +// segment = 0 +// grid_size = 1 +// partial = cupy.ones(outlength * grid_size, dtype=toptr.dtype) +// temp = cupy.ones(lenparents, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, 
temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// out["awkward_reduce_prod_a", {dtype_specializations}] = None +// out["awkward_reduce_prod_b", {dtype_specializations}] = None +// out["awkward_reduce_prod_c", {dtype_specializations}] = None +// END PYTHON + +template +__global__ void +awkward_reduce_prod_a( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + T* partial, + T* temp, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + toptr[thread_id] = 1; + } + } +} + +template +__global__ void +awkward_reduce_prod_b( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + T* partial, + T* temp, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; + + if (thread_id < lenparents) { + temp[idx] = fromptr[thread_id]; + } + __syncthreads(); + + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + T val = 1; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[idx - stride]; + } + __syncthreads(); + temp[idx] *= val; + __syncthreads(); + } + + if (thread_id < lenparents) { + int64_t parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = temp[idx]; + } + } + } +} + +template +__global__ void +awkward_reduce_prod_c( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + 
int64_t outlength, + T* partial, + T* temp, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < outlength) { + T prod = 1; + int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int64_t i = 0; i < blocks; ++i) { + prod *= partial[i * outlength + thread_id]; + } + toptr[thread_id] = prod; + } + } +} diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu index 8bdb3fccc2..52cc05492d 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu @@ -3,10 +3,17 @@ // BEGIN PYTHON // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args -// atomicAdd_toptr = cupy.array(toptr, dtype=cupy.uint64) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_c", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) +// if block[0] > 0: +// segment = math.floor((outlength + block[0] - 1) / block[0]) +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) +// else: +// segment = 0 +// grid_size = 1 +// partial = cupy.zeros(outlength * 
grid_size, dtype=toptr.dtype) +// temp = cupy.zeros(lenparents, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_a", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_b", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_c", int32, bool_, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) // out["awkward_reduce_sum_int32_bool_64_a", {dtype_specializations}] = None // out["awkward_reduce_sum_int32_bool_64_b", {dtype_specializations}] = None // out["awkward_reduce_sum_int32_bool_64_c", {dtype_specializations}] = None @@ -20,14 +27,15 @@ awkward_reduce_sum_int32_bool_64_a( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* partial, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - atomicAdd_toptr[thread_id] = 0; + toptr[thread_id] = 0; } } } @@ -40,34 +48,60 @@ awkward_reduce_sum_int32_bool_64_b( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* partial, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - atomicAdd(atomicAdd_toptr + parents[thread_id], - (uint64_t)(fromptr[thread_id] != 0)); + temp[idx] = (fromptr[thread_id] != 0) ? 
1 : 0;; + } + __syncthreads(); + + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + T val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[idx - stride]; + } + __syncthreads(); + temp[idx] += val; + __syncthreads(); + } + + if (thread_id < lenparents) { + int64_t parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = temp[idx]; + } } } } template __global__ void -awkward_reduce_sum_int32_bool_64_c(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_sum_int32_bool_64_c( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + T* partial, + T* temp, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = (T)atomicAdd_toptr[thread_id]; + T sum = 0; + int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int64_t i = 0; i < blocks; ++i) { + sum += partial[i * outlength + thread_id]; + } + toptr[thread_id] = sum; } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu index 041558a663..a215bb92f3 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu @@ -3,10 +3,17 @@ // BEGIN PYTHON // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args -// atomicAdd_toptr = cupy.array(toptr, dtype=cupy.uint64) -// 
cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_c", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) +// if block[0] > 0: +// segment = math.floor((outlength + block[0] - 1) / block[0]) +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) +// else: +// segment = 0 +// grid_size = 1 +// partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype) +// temp = cupy.zeros(lenparents, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_a", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_b", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_c", int64, bool_, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) // out["awkward_reduce_sum_int64_bool_64_a", {dtype_specializations}] = None // out["awkward_reduce_sum_int64_bool_64_b", {dtype_specializations}] = None // out["awkward_reduce_sum_int64_bool_64_c", 
{dtype_specializations}] = None @@ -20,14 +27,15 @@ awkward_reduce_sum_int64_bool_64_a( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* partial, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - atomicAdd_toptr[thread_id] = 0; + toptr[thread_id] = 0; } } } @@ -40,34 +48,60 @@ awkward_reduce_sum_int64_bool_64_b( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* partial, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - atomicAdd(atomicAdd_toptr + parents[thread_id], - (uint64_t)(fromptr[thread_id] != 0)); + temp[idx] = (fromptr[thread_id] != 0) ? 1 : 0;; + } + __syncthreads(); + + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + T val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[idx - stride]; + } + __syncthreads(); + temp[idx] += val; + __syncthreads(); + } + + if (thread_id < lenparents) { + int64_t parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = temp[idx]; + } } } } template __global__ void -awkward_reduce_sum_int64_bool_64_c(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_sum_int64_bool_64_c( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + T* partial, + T* temp, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] 
== NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = (T)atomicAdd_toptr[thread_id]; + T sum = 0; + int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int64_t i = 0; i < blocks; ++i) { + sum += partial[i * outlength + thread_id]; + } + toptr[thread_id] = sum; } } } From b28a605e976616133c8c9c823089a4a2bcd7a90d Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Thu, 6 Jun 2024 10:45:27 +0200 Subject: [PATCH 10/33] feat: add sum_bool and prod_bool kernels --- kernel-test-data.json | 4 +- .../cuda_kernels/awkward_reduce_prod_bool.cu | 72 ++++++++++++++----- .../cuda_kernels/awkward_reduce_sum_bool.cu | 72 ++++++++++++++----- 3 files changed, 108 insertions(+), 40 deletions(-) diff --git a/kernel-test-data.json b/kernel-test-data.json index 0df339f461..21f7b1932f 100644 --- a/kernel-test-data.json +++ b/kernel-test-data.json @@ -22933,7 +22933,7 @@ }, { "name": "awkward_reduce_sum_bool", - "status": false, + "status": true, "tests": [ { "error": false, @@ -23056,7 +23056,7 @@ }, { "name": "awkward_reduce_prod_bool", - "status": false, + "status": true, "tests": [ { "error": false, diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu index 74843af6c0..af10c4f40d 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu @@ -3,10 +3,17 @@ // BEGIN PYTHON // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args -// atomicAdd_toptr = cupy.array(toptr, dtype=cupy.uint64) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// 
cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_c", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) +// if block[0] > 0: +// segment = math.floor((outlength + block[0] - 1) / block[0]) +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) +// else: +// segment = 0 +// grid_size = 1 +// partial = cupy.ones(outlength * grid_size, dtype=toptr.dtype) +// temp = cupy.ones(lenparents, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_a", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_b", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_c", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) // out["awkward_reduce_prod_bool_a", {dtype_specializations}] = None // out["awkward_reduce_prod_bool_b", {dtype_specializations}] = None // out["awkward_reduce_prod_bool_c", {dtype_specializations}] = None @@ -20,14 +27,15 @@ awkward_reduce_prod_bool_a( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* partial, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if 
(err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - atomicAdd_toptr[thread_id] = true; + toptr[thread_id] = 1; } } } @@ -40,34 +48,60 @@ awkward_reduce_prod_bool_b( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* partial, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - atomicAnd(atomicAdd_toptr + parents[thread_id], - (uint64_t)(fromptr[thread_id] != 0)); + temp[idx] = fromptr[thread_id]; + } + __syncthreads(); + + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + T val = 1; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[idx - stride]; + } + __syncthreads(); + temp[idx] &= (val != 0); + __syncthreads(); + } + + if (thread_id < lenparents) { + int64_t parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = temp[idx]; + } } } } template __global__ void -awkward_reduce_prod_bool_c(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_prod_bool_c( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + T* partial, + T* temp, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = (T)atomicAdd_toptr[thread_id]; + T prod = 1; + int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int64_t i = 0; i < blocks; ++i) { + prod 
&= (partial[i * outlength + thread_id] != 0); + } + toptr[thread_id] = prod; } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu index 0e062a6c78..cee00fd95f 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu @@ -3,10 +3,17 @@ // BEGIN PYTHON // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args -// atomicAdd_toptr = cupy.array(toptr, dtype=cupy.uint64) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_c", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, atomicAdd_toptr, invocation_index, err_code)) +// if block[0] > 0: +// segment = math.floor((outlength + block[0] - 1) / block[0]) +// grid_size = math.floor((lenparents + block[0] - 1) / block[0]) +// else: +// segment = 0 +// grid_size = 1 +// partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype) +// temp = cupy.zeros(lenparents, dtype=toptr.dtype) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_a", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// 
cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_b", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_c", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) // out["awkward_reduce_sum_bool_a", {dtype_specializations}] = None // out["awkward_reduce_sum_bool_b", {dtype_specializations}] = None // out["awkward_reduce_sum_bool_c", {dtype_specializations}] = None @@ -20,14 +27,15 @@ awkward_reduce_sum_bool_a( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* partial, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - atomicAdd_toptr[thread_id] = false; + toptr[thread_id] = 0; } } } @@ -40,34 +48,60 @@ awkward_reduce_sum_bool_b( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomicAdd_toptr, + T* partial, + T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx = threadIdx.x; + int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - atomicOr(atomicAdd_toptr + parents[thread_id], - (uint64_t)(fromptr[thread_id] != 0)); + temp[idx] = fromptr[thread_id]; + } + __syncthreads(); + + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + T val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[idx - stride]; + } + __syncthreads(); + temp[idx] |= (val != 0); + __syncthreads(); + } + + if (thread_id < lenparents) { + int64_t 
parent = parents[thread_id]; + if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { + partial[blockIdx.x * outlength + parent] = temp[idx]; + } } } } template __global__ void -awkward_reduce_sum_bool_c(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_sum_bool_c( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + T* partial, + T* temp, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = (T)atomicAdd_toptr[thread_id]; + T sum = 0; + int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; + for (int64_t i = 0; i < blocks; ++i) { + sum |= (partial[i * outlength + thread_id] != 0); + } + toptr[thread_id] = sum; } } } From 9e7abc710864bac255b2e73f79c95c07ccfe2779 Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Thu, 6 Jun 2024 11:54:57 +0200 Subject: [PATCH 11/33] fix: use cpt.assert_allclose --- dev/generate-tests.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dev/generate-tests.py b/dev/generate-tests.py index 7267d5659c..cb75fb1819 100644 --- a/dev/generate-tests.py +++ b/dev/generate-tests.py @@ -953,6 +953,7 @@ def gencudakerneltests(specdict): f.write( "import cupy\n" + "import cupy.testing as cpt\n" "import pytest\n\n" "import awkward as ak\n" "import awkward._connect.cuda as ak_cu\n" @@ -1022,7 +1023,7 @@ def gencudakerneltests(specdict): if isinstance(val, list): f.write( " " * 4 - + f"assert cupy.array_equal({arg}[:len(pytest_{arg})], cupy.array(pytest_{arg}))\n" + + f"cpt.assert_allclose({arg}[:len(pytest_{arg})], cupy.array(pytest_{arg}))\n" ) else: f.write(" " * 4 + f"assert {arg} == pytest_{arg}\n") @@ -1082,6 +1083,7 @@ def 
gencudaunittests(specdict): f.write( "import re\n" "import cupy\n" + "import cupy.testing as cpt\n" "import pytest\n\n" "import awkward as ak\n" "import awkward._connect.cuda as ak_cu\n" @@ -1218,7 +1220,7 @@ def gencudaunittests(specdict): if isinstance(val, list): f.write( " " * 4 - + f"assert cupy.array_equal({arg}[:len(pytest_{arg})], cupy.array(pytest_{arg}))\n" + + f"cpt.assert_allclose({arg}[:len(pytest_{arg})], cupy.array(pytest_{arg}))\n" ) else: f.write(" " * 4 + f"assert {arg} == pytest_{arg}\n") From 458165c885428ccf08b34288287bbab316e09238 Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Thu, 6 Jun 2024 14:47:45 +0200 Subject: [PATCH 12/33] test: reducer integration tests --- tests-cuda/test_3136_cuda_reducers.py | 379 ++++++++++++++++++++++++++ 1 file changed, 379 insertions(+) create mode 100644 tests-cuda/test_3136_cuda_reducers.py diff --git a/tests-cuda/test_3136_cuda_reducers.py b/tests-cuda/test_3136_cuda_reducers.py new file mode 100644 index 0000000000..3bc554e1a1 --- /dev/null +++ b/tests-cuda/test_3136_cuda_reducers.py @@ -0,0 +1,379 @@ +from __future__ import annotations + +import cupy.testing as cpt +import numpy as np + +import awkward as ak + +to_list = ak.operations.to_list + + +def test_sumprod_types(): + def prod(xs): + out = 1 + for x in xs: + out *= x + return out + + array = np.array([[True, False, False], [True, False, False]]) + content2 = ak.contents.NumpyArray(array.reshape(-1)) + offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + + depth1 = ak.to_backend(depth1, "cuda") + + assert sum(to_list(np.sum(array, axis=-1))) == sum( + to_list(ak.sum(depth1, axis=-1, highlevel=False)) + ) + assert prod(to_list(np.prod(array, axis=-1))) == prod( + to_list(ak.prod(depth1, axis=-1, highlevel=False)) + ) + + array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int8) + content2 = ak.contents.NumpyArray(array.reshape(-1)) + offsets3 = ak.index.Index64(np.array([0, 
3, 3, 5, 6], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + + depth1 = ak.to_backend(depth1, "cuda") + + assert ( + np.sum(array, axis=-1).dtype + == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype + ) + assert ( + np.prod(array, axis=-1).dtype + == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype + ) + assert sum(to_list(np.sum(array, axis=-1))) == sum( + to_list(ak.sum(depth1, axis=-1, highlevel=False)) + ) + assert prod(to_list(np.prod(array, axis=-1))) == prod( + to_list(ak.prod(depth1, axis=-1, highlevel=False)) + ) + + array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint8) + content2 = ak.contents.NumpyArray(array.reshape(-1)) + offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + + depth1 = ak.to_backend(depth1, "cuda") + + assert ( + np.sum(array, axis=-1).dtype + == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype + ) + assert ( + np.prod(array, axis=-1).dtype + == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype + ) + assert sum(to_list(np.sum(array, axis=-1))) == sum( + to_list(ak.sum(depth1, axis=-1, highlevel=False)) + ) + assert prod(to_list(np.prod(array, axis=-1))) == prod( + to_list(ak.prod(depth1, axis=-1, highlevel=False)) + ) + + array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int16) + content2 = ak.contents.NumpyArray(array.reshape(-1)) + offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + + depth1 = ak.to_backend(depth1, "cuda") + + assert ( + np.sum(array, axis=-1).dtype + == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype + ) + assert ( + np.prod(array, axis=-1).dtype + == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype + ) + assert sum(to_list(np.sum(array, axis=-1))) == sum( + to_list(ak.sum(depth1, axis=-1, highlevel=False)) + ) + assert prod(to_list(np.prod(array, axis=-1))) 
== prod( + to_list(ak.prod(depth1, axis=-1, highlevel=False)) + ) + + array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint16) + content2 = ak.contents.NumpyArray(array.reshape(-1)) + offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + + depth1 = ak.to_backend(depth1, "cuda") + + assert ( + np.sum(array, axis=-1).dtype + == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype + ) + assert ( + np.prod(array, axis=-1).dtype + == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype + ) + assert sum(to_list(np.sum(array, axis=-1))) == sum( + to_list(ak.sum(depth1, axis=-1, highlevel=False)) + ) + assert prod(to_list(np.prod(array, axis=-1))) == prod( + to_list(ak.prod(depth1, axis=-1, highlevel=False)) + ) + + array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int32) + content2 = ak.contents.NumpyArray(array.reshape(-1)) + offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + + depth1 = ak.to_backend(depth1, "cuda") + + assert ( + np.sum(array, axis=-1).dtype + == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype + ) + assert ( + np.prod(array, axis=-1).dtype + == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype + ) + assert sum(to_list(np.sum(array, axis=-1))) == sum( + to_list(ak.sum(depth1, axis=-1, highlevel=False)) + ) + assert prod(to_list(np.prod(array, axis=-1))) == prod( + to_list(ak.prod(depth1, axis=-1, highlevel=False)) + ) + + array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint32) + content2 = ak.contents.NumpyArray(array.reshape(-1)) + offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + + depth1 = ak.to_backend(depth1, "cuda") + + assert ( + np.sum(array, axis=-1).dtype + == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype + ) + assert ( + np.prod(array, axis=-1).dtype + 
== ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype + ) + assert sum(to_list(np.sum(array, axis=-1))) == sum( + to_list(ak.sum(depth1, axis=-1, highlevel=False)) + ) + assert prod(to_list(np.prod(array, axis=-1))) == prod( + to_list(ak.prod(depth1, axis=-1, highlevel=False)) + ) + + array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int64) + content2 = ak.contents.NumpyArray(array.reshape(-1)) + offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + + depth1 = ak.to_backend(depth1, "cuda") + + assert ( + np.sum(array, axis=-1).dtype + == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype + ) + assert ( + np.prod(array, axis=-1).dtype + == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype + ) + assert sum(to_list(np.sum(array, axis=-1))) == sum( + to_list(ak.sum(depth1, axis=-1, highlevel=False)) + ) + assert prod(to_list(np.prod(array, axis=-1))) == prod( + to_list(ak.prod(depth1, axis=-1, highlevel=False)) + ) + + array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint64) + content2 = ak.contents.NumpyArray(array.reshape(-1)) + offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + + depth1 = ak.to_backend(depth1, "cuda") + + assert ( + np.sum(array, axis=-1).dtype + == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype + ) + assert ( + np.prod(array, axis=-1).dtype + == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype + ) + assert sum(to_list(np.sum(array, axis=-1))) == sum( + to_list(ak.sum(depth1, axis=-1, highlevel=False)) + ) + assert prod(to_list(np.prod(array, axis=-1))) == prod( + to_list(ak.prod(depth1, axis=-1, highlevel=False)) + ) + + +def test_sumprod_types_FIXME(): + array = np.array([[True, False, False], [True, False, False]]) + content2 = ak.contents.NumpyArray(array.reshape(-1)) + offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], 
dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + depth1 = ak.to_backend(depth1, "cuda") + + assert ( + np.sum(array, axis=-1).dtype + == ak.to_numpy(ak.sum(depth1, axis=-1, highlevel=False)).dtype + ) + assert ( + np.prod(array, axis=-1).dtype + == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype + ) + + +array = ak.Array( + [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" +) + + +def test_sum(): + cpt.assert_allclose(ak.sum(array, axis=None), 63.0) + assert ak.almost_equal( + ak.sum(array, axis=None, keepdims=True), + ak.to_regular(ak.Array([[63.0]], backend="cuda")), + ) + assert ak.almost_equal( + ak.sum(array, axis=None, keepdims=True, mask_identity=True), + ak.to_regular( + ak.Array([[63.0]], backend="cuda").mask[ak.Array([[True]], backend="cuda")] + ), + ) + assert ak.sum(array[2], axis=None, mask_identity=True) is None + + +def test_prod(): + cpt.assert_allclose(ak.prod(array[1:], axis=None), 4838400.0) + assert ak.prod(array, axis=None) == 0 + assert ak.almost_equal( + ak.prod(array, axis=None, keepdims=True), + ak.to_regular(ak.Array([[0.0]], backend="cuda")), + ) + assert ak.almost_equal( + ak.prod(array[1:], axis=None, keepdims=True), + ak.to_regular(ak.Array([[4838400.0]], backend="cuda")), + ) + assert ak.almost_equal( + ak.prod(array[1:], axis=None, keepdims=True, mask_identity=True), + ak.to_regular( + ak.Array([[4838400.0]], backend="cuda").mask[ + ak.Array([[True]], backend="cuda") + ] + ), + ) + assert ak.prod(array[2], axis=None, mask_identity=True) is None + + +def test_min(): + cpt.assert_allclose(ak.min(array, axis=None), 0.0) + assert ak.almost_equal( + ak.min(array, axis=None, keepdims=True, mask_identity=False), + ak.to_regular(ak.Array([[0.0]], backend="cuda")), + ) + assert ak.almost_equal( + ak.min(array, axis=None, keepdims=True, initial=-100.0, mask_identity=False), + ak.to_regular(ak.Array([[-100.0]], backend="cuda")), + ) + + assert ak.almost_equal( + ak.min(array, 
axis=None, keepdims=True, mask_identity=True), + ak.to_regular( + ak.Array([[0.0]], backend="cuda").mask[ak.Array([[True]], backend="cuda")] + ), + ) + assert ak.almost_equal( + ak.min(array[-1:], axis=None, keepdims=True, mask_identity=True), + ak.to_regular( + ak.Array(ak.Array([[np.inf]], backend="cuda")).mask[ + ak.Array([[False]], backend="cuda") + ] + ), + ) + assert ak.min(array[2], axis=None, mask_identity=True) is None + + +def test_max(): + cpt.assert_allclose(ak.max(array, axis=None), 10.0) + assert ak.almost_equal( + ak.max(array, axis=None, keepdims=True, mask_identity=False), + ak.to_regular(ak.Array([[10.0]], backend="cuda")), + ) + assert ak.almost_equal( + ak.max(array, axis=None, keepdims=True, initial=100.0, mask_identity=False), + ak.to_regular(ak.Array([[100.0]], backend="cuda")), + ) + assert ak.almost_equal( + ak.max(array, axis=None, keepdims=True, mask_identity=True), + ak.to_regular( + ak.Array([[10.0]], backend="cuda").mask[ak.Array([[True]], backend="cuda")] + ), + ) + assert ak.almost_equal( + ak.max(array[-1:], axis=None, keepdims=True, mask_identity=True), + ak.to_regular( + ak.Array(ak.Array([[np.inf]], backend="cuda")).mask[ + ak.Array([[False]], backend="cuda") + ] + ), + ) + assert ak.max(array[2], axis=None, mask_identity=True) is None + + +array = ak.Array( + [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" +) + + +def test_count(): + assert ak.count(array, axis=None) == 12 + assert ak.almost_equal( + ak.count(array, axis=None, keepdims=True, mask_identity=False), + ak.to_regular(ak.Array([[12]], backend="cuda")), + ) + assert ak.almost_equal( + ak.count(array, axis=None, keepdims=True, mask_identity=True), + ak.to_regular( + ak.Array([[12]], backend="cuda").mask[ak.Array([[True]], backend="cuda")] + ), + ) + assert ak.almost_equal( + ak.count(array[-1:], axis=None, keepdims=True, mask_identity=True), + ak.to_regular( + ak.Array([[0]], backend="cuda").mask[ak.Array([[False]], backend="cuda")] + ), + 
) + assert ak.count(array[2], axis=None, mask_identity=True) is None + assert ak.count(array[2], axis=None, mask_identity=False) == 0 + + +def test_count_nonzero(): + assert ak.count_nonzero(array, axis=None) == 11 + assert ak.almost_equal( + ak.count_nonzero(array, axis=None, keepdims=True, mask_identity=False), + ak.to_regular(ak.Array([[11]], backend="cuda")), + ) + assert ak.almost_equal( + ak.count_nonzero(array, axis=None, keepdims=True, mask_identity=True), + ak.to_regular( + ak.Array([[11]], backend="cuda").mask[ak.Array([[True]], backend="cuda")] + ), + ) + assert ak.almost_equal( + ak.count_nonzero(array[-1:], axis=None, keepdims=True, mask_identity=True), + ak.to_regular( + ak.Array([[0]], backend="cuda").mask[ak.Array([[False]], backend="cuda")] + ), + ) + assert ak.count_nonzero(array[2], axis=None, mask_identity=True) is None + assert ak.count_nonzero(array[2], axis=None, mask_identity=False) == 0 + + +def test_std_no_mask_axis_none(): + assert ak.almost_equal( + ak.std(array[-1:], axis=None, keepdims=True, mask_identity=True), + ak.to_regular( + ak.Array([[0.0]], backend="cuda").mask[ak.Array([[False]], backend="cuda")] + ), + ) + assert ak.std(array[2], axis=None, mask_identity=True) is None From c75cb7922ffa06a8ad1d3265b3d2fe3c14b38ccb Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Thu, 6 Jun 2024 16:30:46 +0200 Subject: [PATCH 13/33] fix: type conversion --- .../cuda/cuda_kernels/awkward_reduce_max.cu | 14 +++++++------- .../cuda/cuda_kernels/awkward_reduce_min.cu | 14 +++++++------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu index 4ac7df43ba..a411d1970a 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu @@ -10,7 +10,7 @@ // segment = 0 // grid_size = 1 // partial = cupy.full(outlength * grid_size, identity,
dtype=toptr.dtype) -// temp = cupy.zeros(lenparents, dtype=toptr.dtype) +// temp = cupy.full(lenparents, identity, dtype=toptr.dtype) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code)) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code)) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code)) @@ -27,7 +27,7 @@ awkward_reduce_max_a( const U* parents, int64_t lenparents, int64_t outlength, - T identity, + int64_t identity, T* partial, T* temp, uint64_t invocation_index, @@ -36,7 +36,7 @@ awkward_reduce_max_a( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = identity; + toptr[thread_id] = static_cast(identity); } } } @@ -49,7 +49,7 @@ awkward_reduce_max_b( const U* parents, int64_t lenparents, int64_t outlength, - T identity, + int64_t identity, T* partial, T* temp, uint64_t invocation_index, @@ -64,7 +64,7 @@ awkward_reduce_max_b( __syncthreads(); for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { - T val = identity; + T val = static_cast(identity); if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { val = temp[idx - stride]; } @@ -90,7 +90,7 @@ awkward_reduce_max_c( const U* parents, int64_t lenparents, int64_t outlength, - T identity, + int64_t identity, T* 
partial, T* temp, uint64_t invocation_index, @@ -99,7 +99,7 @@ awkward_reduce_max_c( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - T maximum = identity; + T maximum = static_cast(identity); int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; for (int64_t i = 0; i < blocks; ++i) { maximum = maximum > partial[i * outlength + thread_id] ? maximum : partial[i * outlength + thread_id]; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu index f524485e58..828097a14f 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu @@ -10,7 +10,7 @@ // segment = 0 // grid_size = 1 // partial = cupy.full(outlength * grid_size, identity, dtype=toptr.dtype) -// temp = cupy.zeros(lenparents, dtype=toptr.dtype) +// temp = cupy.full(lenparents, identity, dtype=toptr.dtype) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code)) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code)) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code)) @@ -27,7 +27,7 @@ awkward_reduce_min_a( const U* parents, int64_t lenparents, int64_t outlength, - T identity, + int64_t identity, T* partial, T* temp, 
uint64_t invocation_index, @@ -36,7 +36,7 @@ awkward_reduce_min_a( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = identity; + toptr[thread_id] = static_cast(identity); } } } @@ -49,7 +49,7 @@ awkward_reduce_min_b( const U* parents, int64_t lenparents, int64_t outlength, - T identity, + int64_t identity, T* partial, T* temp, uint64_t invocation_index, @@ -64,7 +64,7 @@ awkward_reduce_min_b( __syncthreads(); for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { - T val = identity; + T val = static_cast(identity); if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { val = temp[idx - stride]; } @@ -90,7 +90,7 @@ awkward_reduce_min_c( const U* parents, int64_t lenparents, int64_t outlength, - T identity, + int64_t identity, T* partial, T* temp, uint64_t invocation_index, @@ -99,7 +99,7 @@ awkward_reduce_min_c( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - T minimum = identity; + T minimum = static_cast(identity); int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; for (int64_t i = 0; i < blocks; ++i) { minimum = minimum < partial[i * outlength + thread_id] ? 
minimum : partial[i * outlength + thread_id]; From 427670c15b97c645b19ac939ff726522171b344a Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Fri, 7 Jun 2024 11:56:24 +0200 Subject: [PATCH 14/33] fix: use atomic to avoid race conditions --- ...tOffsetArray_reduce_local_outoffsets_64.cu | 32 ++++++++----------- .../cuda_kernels/awkward_reduce_argmax.cu | 31 ++++++------------ .../cuda_kernels/awkward_reduce_argmin.cu | 31 ++++++------------ .../cuda_kernels/awkward_reduce_count_64.cu | 27 ++++++---------- .../awkward_reduce_countnonzero.cu | 27 ++++++---------- .../awkward_reduce_sum_int32_bool_64.cu | 27 ++++++---------- .../awkward_reduce_sum_int64_bool_64.cu | 27 ++++++---------- 7 files changed, 73 insertions(+), 129 deletions(-) diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu index 5ae3d2eb56..3dcdf14727 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu @@ -7,16 +7,15 @@ // segment = math.floor((outlength + block[0] - 1) / block[0]) // grid_size = math.floor((lenparents + block[0] - 1) / block[0]) // else: -// segment = 0 // grid_size = 1 -// partial = cupy.zeros(outlength * grid_size, dtype=outoffsets.dtype) +// atomic_outoffsets = cupy.array(outoffsets, dtype=cupy.uint64) // temp = cupy.zeros(lenparents, dtype=cupy.int64) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", cupy.dtype(outoffsets.dtype).type, 
parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, atomic_outoffsets, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, atomic_outoffsets, temp, invocation_index, err_code)) // scan_in_array = cupy.zeros(outlength, dtype=cupy.int64) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_c", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((segment,), block, (outoffsets, parents, lenparents, outlength, partial, scan_in_array, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_c", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, atomic_outoffsets, scan_in_array, invocation_index, err_code)) // scan_in_array = cupy.cumsum(scan_in_array) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_d", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, partial, scan_in_array, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_d", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, atomic_outoffsets, scan_in_array, invocation_index, err_code)) // 
out["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", {dtype_specializations}] = None // out["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", {dtype_specializations}] = None // out["awkward_ListOffsetArray_reduce_local_outoffsets_64_c", {dtype_specializations}] = None @@ -30,7 +29,7 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_a( const C* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint64_t* atomic_outoffsets, int64_t* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -38,7 +37,7 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_a( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - outoffsets[thread_id] = 0; + atomic_outoffsets[thread_id] = 0; } } } @@ -50,7 +49,7 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_b( const C* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint64_t* atomic_outoffsets, int64_t* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -76,7 +75,7 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_b( if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - partial[blockIdx.x * outlength + parent] = temp[idx]; + atomicAdd(&atomic_outoffsets[parent], temp[idx]); } } } @@ -89,7 +88,7 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_c( const C* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint64_t* atomic_outoffsets, int64_t* scan_in_array, uint64_t invocation_index, uint64_t* err_code) { @@ -97,12 +96,7 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_c( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - int64_t count = 0; - int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; - for (int64_t i = 0; i < blocks; ++i) { - count += partial[i * outlength + thread_id]; - } - scan_in_array[thread_id] = count; + scan_in_array[thread_id] = 
atomic_outoffsets[thread_id]; } } } @@ -114,7 +108,7 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_d( const C* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint64_t* atomic_outoffsets, int64_t* scan_in_array, uint64_t invocation_index, uint64_t* err_code) { @@ -123,7 +117,7 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_d( outoffsets[0] = 0; if (thread_id < outlength) { - outoffsets[thread_id + 1] = scan_in_array[thread_id]; + outoffsets[thread_id + 1] = static_cast(scan_in_array[thread_id]); } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu index 122894795e..d2fe929a6b 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu @@ -4,16 +4,14 @@ // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args // if block[0] > 0: -// segment = math.floor((outlength + block[0] - 1) / block[0]) // grid_size = math.floor((lenparents + block[0] - 1) / block[0]) // else: -// segment = 0 // grid_size = 1 -// partial = cupy.full(outlength * grid_size, -1, dtype=toptr.dtype) +// atomic_toptr = cupy.array(toptr, dtype=cupy.uint64) // temp = cupy.zeros(lenparents, dtype=toptr.dtype) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) -// 
cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) // out["awkward_reduce_argmax_a", {dtype_specializations}] = None // out["awkward_reduce_argmax_b", {dtype_specializations}] = None // out["awkward_reduce_argmax_c", {dtype_specializations}] = None @@ -27,7 +25,7 @@ awkward_reduce_argmax_a( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -35,7 +33,7 @@ awkward_reduce_argmax_a( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = -1; + atomic_toptr[thread_id] = -1; } } } @@ -48,7 +46,7 @@ awkward_reduce_argmax_b( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -76,7 +74,7 @@ awkward_reduce_argmax_b( if (thread_id < lenparents) { 
int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - partial[blockIdx.x * outlength + parent] = temp[idx]; + atomicExch(&atomic_toptr[parent], temp[idx]); } } } @@ -90,7 +88,7 @@ awkward_reduce_argmax_c( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -98,16 +96,7 @@ awkward_reduce_argmax_c( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - int64_t argmax = -1; - int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; - for (int64_t i = 0; i < blocks; ++i) { - int64_t index = partial[i * outlength + thread_id]; - if (index != -1 && (argmax == -1 || fromptr[index] > fromptr[argmax]) || - (fromptr[index] == fromptr[argmax] && index < argmax)) { - argmax = index; - } - } - toptr[thread_id] = argmax; + toptr[thread_id] = static_cast(atomic_toptr[thread_id]); } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu index 40a8437218..754ec84cfb 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu @@ -4,16 +4,14 @@ // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args // if block[0] > 0: -// segment = math.floor((outlength + block[0] - 1) / block[0]) // grid_size = math.floor((lenparents + block[0] - 1) / block[0]) // else: -// segment = 0 // grid_size = 1 -// partial = cupy.full(outlength * grid_size, -1, dtype=toptr.dtype) +// atomic_toptr = cupy.array(toptr, dtype=cupy.uint64) // temp = cupy.zeros(lenparents, dtype=toptr.dtype) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, 
parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmin_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) // out["awkward_reduce_argmin_a", {dtype_specializations}] = None // out["awkward_reduce_argmin_b", {dtype_specializations}] = None // out["awkward_reduce_argmin_c", {dtype_specializations}] = None @@ -27,7 +25,7 @@ awkward_reduce_argmin_a( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -35,7 +33,7 @@ awkward_reduce_argmin_a( int64_t 
thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = -1; + atomic_toptr[thread_id] = -1; } } } @@ -48,7 +46,7 @@ awkward_reduce_argmin_b( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -76,7 +74,7 @@ awkward_reduce_argmin_b( if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - partial[blockIdx.x * outlength + parent] = temp[idx]; + atomicExch(&atomic_toptr[parent], temp[idx]); } } } @@ -90,7 +88,7 @@ awkward_reduce_argmin_c( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -98,16 +96,7 @@ awkward_reduce_argmin_c( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - int64_t argmin = -1; - int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; - for (int64_t i = 0; i < blocks; ++i) { - int64_t index = partial[i * outlength + thread_id]; - if (index != -1 && (argmin == -1 || fromptr[index] < fromptr[argmin]) || - (fromptr[index] == fromptr[argmin] && index < argmin)) { - argmin = index; - } - } - toptr[thread_id] = argmin; + toptr[thread_id] = static_cast(atomic_toptr[thread_id]); } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu index cdf870c63c..f2a306f5c6 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu @@ -4,16 +4,14 @@ // def f(grid, block, args): // (toptr, parents, lenparents, outlength, invocation_index, err_code) = args // if block[0] > 0: -// segment = math.floor((outlength + block[0] - 1) / block[0]) // grid_size = 
math.floor((lenparents + block[0] - 1) / block[0]) // else: -// segment = 0 // grid_size = 1 -// partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype) +// atomic_toptr = cupy.array(toptr, dtype=cupy.uint64) // temp = cupy.zeros(lenparents, dtype=toptr.dtype) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_a", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_b", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_c", cupy.dtype(toptr.dtype).type, parents.dtype]))((segment,), block, (toptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_a", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_b", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_c", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) // out["awkward_reduce_count_64_a", {dtype_specializations}] = None // out["awkward_reduce_count_64_b", {dtype_specializations}] = None // out["awkward_reduce_count_64_c", {dtype_specializations}] = None @@ -26,7 +24,7 @@ awkward_reduce_count_64_a( const C* parents, 
int64_t lenparents, int64_t outlength, - T* partial, + uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -34,7 +32,7 @@ awkward_reduce_count_64_a( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = 0; + atomic_toptr[thread_id] = 0; } } } @@ -46,7 +44,7 @@ awkward_reduce_count_64_b( const C* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -72,7 +70,7 @@ awkward_reduce_count_64_b( if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - partial[blockIdx.x * outlength + parent] = temp[idx]; + atomicAdd(&atomic_toptr[parent], temp[idx]); } } } @@ -85,7 +83,7 @@ awkward_reduce_count_64_c( const C* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -93,12 +91,7 @@ awkward_reduce_count_64_c( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - int64_t count = 0; - int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; - for (int64_t i = 0; i < blocks; ++i) { - count += partial[i * outlength + thread_id]; - } - toptr[thread_id] = count; + toptr[thread_id] = static_cast(atomic_toptr[thread_id]); } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu index 11bb84b18f..1652e0b918 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu @@ -4,16 +4,14 @@ // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args // if block[0] > 0: -// segment = math.floor((outlength + 
block[0] - 1) / block[0]) // grid_size = math.floor((lenparents + block[0] - 1) / block[0]) // else: -// segment = 0 // grid_size = 1 -// partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype) +// atomic_toptr = cupy.array(toptr, dtype=cupy.uint64) // temp = cupy.zeros(lenparents, dtype=toptr.dtype) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, 
atomic_toptr, temp, invocation_index, err_code)) // out["awkward_reduce_countnonzero_a", {dtype_specializations}] = None // out["awkward_reduce_countnonzero_b", {dtype_specializations}] = None // out["awkward_reduce_countnonzero_c", {dtype_specializations}] = None @@ -27,7 +25,7 @@ awkward_reduce_countnonzero_a( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -35,7 +33,7 @@ awkward_reduce_countnonzero_a( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = 0; + atomic_toptr[thread_id] = 0; } } } @@ -48,7 +46,7 @@ awkward_reduce_countnonzero_b( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -74,7 +72,7 @@ awkward_reduce_countnonzero_b( if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - partial[blockIdx.x * outlength + parent] = temp[idx]; + atomicAdd(&atomic_toptr[parent], temp[idx]); } } } @@ -88,7 +86,7 @@ awkward_reduce_countnonzero_c( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -96,12 +94,7 @@ awkward_reduce_countnonzero_c( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - int64_t count = 0; - int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; - for (int64_t i = 0; i < blocks; ++i) { - count += partial[i * outlength + thread_id]; - } - toptr[thread_id] = count; + toptr[thread_id] = static_cast(atomic_toptr[thread_id]); } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu index 
52cc05492d..15e983b35b 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu @@ -4,16 +4,14 @@ // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args // if block[0] > 0: -// segment = math.floor((outlength + block[0] - 1) / block[0]) // grid_size = math.floor((lenparents + block[0] - 1) / block[0]) // else: -// segment = 0 // grid_size = 1 -// partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype) +// atomic_toptr = cupy.array(toptr, dtype=cupy.uint64) // temp = cupy.zeros(lenparents, dtype=toptr.dtype) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_a", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_b", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_c", int32, bool_, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_a", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_b", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// 
cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_c", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) // out["awkward_reduce_sum_int32_bool_64_a", {dtype_specializations}] = None // out["awkward_reduce_sum_int32_bool_64_b", {dtype_specializations}] = None // out["awkward_reduce_sum_int32_bool_64_c", {dtype_specializations}] = None @@ -27,7 +25,7 @@ awkward_reduce_sum_int32_bool_64_a( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -35,7 +33,7 @@ awkward_reduce_sum_int32_bool_64_a( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = 0; + atomic_toptr[thread_id] = 0; } } } @@ -48,7 +46,7 @@ awkward_reduce_sum_int32_bool_64_b( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -74,7 +72,7 @@ awkward_reduce_sum_int32_bool_64_b( if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - partial[blockIdx.x * outlength + parent] = temp[idx]; + atomicAdd(&atomic_toptr[parent], temp[idx]); } } } @@ -88,7 +86,7 @@ awkward_reduce_sum_int32_bool_64_c( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -96,12 +94,7 @@ awkward_reduce_sum_int32_bool_64_c( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - T sum = 0; - int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; - for (int64_t i = 0; i < blocks; ++i) { - sum += partial[i * outlength + thread_id]; - } - toptr[thread_id] = sum; + toptr[thread_id] = 
static_cast(atomic_toptr[thread_id]); } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu index a215bb92f3..d381c526a9 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu @@ -4,16 +4,14 @@ // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args // if block[0] > 0: -// segment = math.floor((outlength + block[0] - 1) / block[0]) // grid_size = math.floor((lenparents + block[0] - 1) / block[0]) // else: -// segment = 0 // grid_size = 1 -// partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype) +// atomic_toptr = cupy.array(toptr, dtype=cupy.uint64) // temp = cupy.zeros(lenparents, dtype=toptr.dtype) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_a", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_b", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_c", int64, bool_, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_a", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_b", int64, bool_, 
parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_c", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) // out["awkward_reduce_sum_int64_bool_64_a", {dtype_specializations}] = None // out["awkward_reduce_sum_int64_bool_64_b", {dtype_specializations}] = None // out["awkward_reduce_sum_int64_bool_64_c", {dtype_specializations}] = None @@ -27,7 +25,7 @@ awkward_reduce_sum_int64_bool_64_a( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -35,7 +33,7 @@ awkward_reduce_sum_int64_bool_64_a( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = 0; + atomic_toptr[thread_id] = 0; } } } @@ -48,7 +46,7 @@ awkward_reduce_sum_int64_bool_64_b( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -74,7 +72,7 @@ awkward_reduce_sum_int64_bool_64_b( if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - partial[blockIdx.x * outlength + parent] = temp[idx]; + atomicAdd(&atomic_toptr[parent], temp[idx]); } } } @@ -88,7 +86,7 @@ awkward_reduce_sum_int64_bool_64_c( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -96,12 +94,7 @@ awkward_reduce_sum_int64_bool_64_c( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - T sum = 0; - int64_t blocks = (lenparents + blockDim.x - 1) / 
blockDim.x; - for (int64_t i = 0; i < blocks; ++i) { - sum += partial[i * outlength + thread_id]; - } - toptr[thread_id] = sum; + toptr[thread_id] = static_cast(atomic_toptr[thread_id]); } } } From 127e035b4e03bde5b27677162ee1f5c734244cf0 Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Mon, 10 Jun 2024 17:02:49 +0200 Subject: [PATCH 15/33] fix: remove unnessary variable --- .../cuda/cuda_kernels/awkward_reduce_sum.cu | 39 +------------ .../cuda_kernels/awkward_reduce_sum_bool.cu | 27 ++++----- .../awkward_reduce_sum_int32_bool_64.cu | 34 ++--------- .../awkward_reduce_sum_int64_bool_64.cu | 34 ++--------- .../_connect/cuda/cuda_kernels/cuda_common.cu | 57 +++++++++++++++++++ 5 files changed, 78 insertions(+), 113 deletions(-) diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu index e641d728b4..bcbad2e07a 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu @@ -4,19 +4,14 @@ // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args // if block[0] > 0: -// segment = math.floor((outlength + block[0] - 1) / block[0]) // grid_size = math.floor((lenparents + block[0] - 1) / block[0]) // else: -// segment = 0 // grid_size = 1 -// partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype) // temp = cupy.zeros(lenparents, dtype=toptr.dtype) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, 
outlength, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, temp, invocation_index, err_code)) // out["awkward_reduce_sum_a", {dtype_specializations}] = None // out["awkward_reduce_sum_b", {dtype_specializations}] = None -// out["awkward_reduce_sum_c", {dtype_specializations}] = None // END PYTHON template @@ -27,7 +22,6 @@ awkward_reduce_sum_a( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -48,7 +42,6 @@ awkward_reduce_sum_b( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -74,34 +67,8 @@ awkward_reduce_sum_b( if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - partial[blockIdx.x * outlength + parent] = temp[idx]; + atomicAdd(&toptr[parent], temp[idx]); } } } } - -template -__global__ void -awkward_reduce_sum_c( - T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - T* partial, - T* temp, - uint64_t invocation_index, - uint64_t* err_code) { - if (err_code[0] == NO_ERROR) { - 
int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; - - if (thread_id < outlength) { - T sum = 0; - int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; - for (int64_t i = 0; i < blocks; ++i) { - sum += partial[i * outlength + thread_id]; - } - toptr[thread_id] = sum; - } - } -} diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu index cee00fd95f..d233a9b9ed 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu @@ -4,16 +4,14 @@ // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args // if block[0] > 0: -// segment = math.floor((outlength + block[0] - 1) / block[0]) // grid_size = math.floor((lenparents + block[0] - 1) / block[0]) // else: -// segment = 0 // grid_size = 1 -// partial = cupy.zeros(outlength * grid_size, dtype=toptr.dtype) +// atomic_toptr = cupy.array(toptr, dtype=cupy.uint32) // temp = cupy.zeros(lenparents, dtype=toptr.dtype) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_a", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_b", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_c", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_a", bool_, 
cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_b", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_bool_c", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) // out["awkward_reduce_sum_bool_a", {dtype_specializations}] = None // out["awkward_reduce_sum_bool_b", {dtype_specializations}] = None // out["awkward_reduce_sum_bool_c", {dtype_specializations}] = None @@ -27,7 +25,7 @@ awkward_reduce_sum_bool_a( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint32_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -35,7 +33,7 @@ awkward_reduce_sum_bool_a( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = 0; + atomic_toptr[thread_id] = 0; } } } @@ -48,7 +46,7 @@ awkward_reduce_sum_bool_b( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint32_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -74,7 +72,7 @@ awkward_reduce_sum_bool_b( if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - partial[blockIdx.x * outlength + parent] = temp[idx]; + atomicOr(&atomic_toptr[parent], temp[idx]); } } } @@ -88,7 +86,7 @@ awkward_reduce_sum_bool_c( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint32_t* atomic_toptr, T* temp, uint64_t 
invocation_index, uint64_t* err_code) { @@ -96,12 +94,7 @@ awkward_reduce_sum_bool_c( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - T sum = 0; - int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; - for (int64_t i = 0; i < blocks; ++i) { - sum |= (partial[i * outlength + thread_id] != 0); - } - toptr[thread_id] = sum; + toptr[thread_id] = (T)(atomic_toptr[thread_id]); } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu index 15e983b35b..9aa8636e9b 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu @@ -7,14 +7,11 @@ // grid_size = math.floor((lenparents + block[0] - 1) / block[0]) // else: // grid_size = 1 -// atomic_toptr = cupy.array(toptr, dtype=cupy.uint64) // temp = cupy.zeros(lenparents, dtype=toptr.dtype) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_a", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_b", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_c", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_a", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, temp, invocation_index, err_code)) +// 
cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int32_bool_64_b", int32, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, temp, invocation_index, err_code)) // out["awkward_reduce_sum_int32_bool_64_a", {dtype_specializations}] = None // out["awkward_reduce_sum_int32_bool_64_b", {dtype_specializations}] = None -// out["awkward_reduce_sum_int32_bool_64_c", {dtype_specializations}] = None // END PYTHON template @@ -25,7 +22,6 @@ awkward_reduce_sum_int32_bool_64_a( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -33,7 +29,7 @@ awkward_reduce_sum_int32_bool_64_a( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - atomic_toptr[thread_id] = 0; + toptr[thread_id] = 0; } } } @@ -46,7 +42,6 @@ awkward_reduce_sum_int32_bool_64_b( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -72,29 +67,8 @@ awkward_reduce_sum_int32_bool_64_b( if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - atomicAdd(&atomic_toptr[parent], temp[idx]); + atomicAdd(&toptr[parent], temp[idx]); } } } } - -template -__global__ void -awkward_reduce_sum_int32_bool_64_c( - T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomic_toptr, - T* temp, - uint64_t invocation_index, - uint64_t* err_code) { - if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; - - if (thread_id < outlength) { - toptr[thread_id] = static_cast(atomic_toptr[thread_id]); - } - } -} diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu 
b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu index d381c526a9..9f6399eac2 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu @@ -7,14 +7,11 @@ // grid_size = math.floor((lenparents + block[0] - 1) / block[0]) // else: // grid_size = 1 -// atomic_toptr = cupy.array(toptr, dtype=cupy.uint64) // temp = cupy.zeros(lenparents, dtype=toptr.dtype) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_a", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_b", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_c", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_a", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_sum_int64_bool_64_b", int64, bool_, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, temp, invocation_index, err_code)) // out["awkward_reduce_sum_int64_bool_64_a", {dtype_specializations}] = None // out["awkward_reduce_sum_int64_bool_64_b", {dtype_specializations}] = None -// out["awkward_reduce_sum_int64_bool_64_c", {dtype_specializations}] = None // END PYTHON template @@ -25,7 +22,6 @@ 
awkward_reduce_sum_int64_bool_64_a( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -33,7 +29,7 @@ awkward_reduce_sum_int64_bool_64_a( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - atomic_toptr[thread_id] = 0; + toptr[thread_id] = 0; } } } @@ -46,7 +42,6 @@ awkward_reduce_sum_int64_bool_64_b( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -72,29 +67,8 @@ awkward_reduce_sum_int64_bool_64_b( if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - atomicAdd(&atomic_toptr[parent], temp[idx]); + atomicAdd(&toptr[parent], temp[idx]); } } } } - -template -__global__ void -awkward_reduce_sum_int64_bool_64_c( - T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomic_toptr, - T* temp, - uint64_t invocation_index, - uint64_t* err_code) { - if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; - - if (thread_id < outlength) { - toptr[thread_id] = static_cast(atomic_toptr[thread_id]); - } - } -} diff --git a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu index 8a02094f34..89ad707471 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu @@ -32,6 +32,27 @@ typedef unsigned long long uintmax_t; atomicMin(err_code, \ invocation_index*(1 << ERROR_BITS) + (int)(ERROR_KERNEL_CODE)); +// BEGIN PYTHON +// def min_max_type(dtype): +// supported_types = { +// 'bool': cupy.int32, +// 'int8': cupy.int32, +// 'int16': cupy.int32, +// 'int32': cupy.int32, +// 'int64': cupy.int64, +// 'uint8': cupy.uint32, +// 
'uint16': cupy.uint32, +// 'uint32': cupy.uint32, +// 'uint64': cupy.uint64, +// 'float32': cupy.float32, +// 'float64': cupy.float64 +// } +// if str(dtype) in supported_types: +// return supported_types[str(dtype)] +// else: +// raise ValueError("Unsupported dtype.", dtype) +// END PYTHON + // BEGIN PYTHON // def inclusive_scan(grid, block, args): // (d_in, invocation_index, err_code) = args @@ -144,3 +165,39 @@ exclusive_scan_kernel(T* input, } } } + +__device__ __forceinline__ float atomicMin(float* addr, float value) { + float old; old = !signbit(value) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value))) : __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value))); + return old; +} +__device__ __forceinline__ float atomicMax(float* addr, float value) { + float old; old = !signbit(value) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) : __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value))); + return old; +} + + +typedef long long int64_t; + + +template +struct is_int64_t { + static const bool value = false; +}; + + +template <> +struct is_int64_t { + static const bool value = true; +}; + + + +__device__ int64_t atomicAdd(int64_t* address, int64_t val) { + uint64_t* address_as_ull = (uint64_t*)address; + uint64_t old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, assumed + (uint64_t)val); + } while (assumed != old); + return (int64_t)old; +} From 8dee2aec2ede72be9763733a8fd44b56bf15c920 Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Mon, 10 Jun 2024 17:18:57 +0200 Subject: [PATCH 16/33] fix: minor fixes --- .../cuda_kernels/awkward_reduce_argmax.cu | 2 +- .../cuda_kernels/awkward_reduce_argmin.cu | 2 +- .../cuda_kernels/awkward_reduce_count_64.cu | 33 +++--------------- .../awkward_reduce_countnonzero.cu | 34 +++---------------- .../_connect/cuda/cuda_kernels/cuda_common.cu | 16 --------- 5 files changed, 10 insertions(+), 77 deletions(-) diff --git 
a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu index d2fe929a6b..0202d7276f 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu @@ -96,7 +96,7 @@ awkward_reduce_argmax_c( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = static_cast(atomic_toptr[thread_id]); + toptr[thread_id] = (T)(atomic_toptr[thread_id]); } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu index 754ec84cfb..e2215b1b11 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu @@ -96,7 +96,7 @@ awkward_reduce_argmin_c( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = static_cast(atomic_toptr[thread_id]); + toptr[thread_id] = (T)(atomic_toptr[thread_id]); } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu index f2a306f5c6..2d317aebcd 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu @@ -7,14 +7,11 @@ // grid_size = math.floor((lenparents + block[0] - 1) / block[0]) // else: // grid_size = 1 -// atomic_toptr = cupy.array(toptr, dtype=cupy.uint64) // temp = cupy.zeros(lenparents, dtype=toptr.dtype) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_a", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) -// 
cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_b", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_c", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_a", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_count_64_b", cupy.dtype(toptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, parents, lenparents, outlength, temp, invocation_index, err_code)) // out["awkward_reduce_count_64_a", {dtype_specializations}] = None // out["awkward_reduce_count_64_b", {dtype_specializations}] = None -// out["awkward_reduce_count_64_c", {dtype_specializations}] = None // END PYTHON template @@ -24,7 +21,6 @@ awkward_reduce_count_64_a( const C* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -32,7 +28,7 @@ awkward_reduce_count_64_a( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - atomic_toptr[thread_id] = 0; + toptr[thread_id] = 0; } } } @@ -44,7 +40,6 @@ awkward_reduce_count_64_b( const C* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -70,28 +65,8 @@ awkward_reduce_count_64_b( if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - 
atomicAdd(&atomic_toptr[parent], temp[idx]); + atomicAdd(&toptr[parent], temp[idx]); } } } } - -template -__global__ void -awkward_reduce_count_64_c( - T* toptr, - const C* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomic_toptr, - T* temp, - uint64_t invocation_index, - uint64_t* err_code) { - if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; - - if (thread_id < outlength) { - toptr[thread_id] = static_cast(atomic_toptr[thread_id]); - } - } -} diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu index 1652e0b918..7af29ad6c7 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu @@ -7,14 +7,11 @@ // grid_size = math.floor((lenparents + block[0] - 1) / block[0]) // else: // grid_size = 1 -// atomic_toptr = cupy.array(toptr, dtype=cupy.uint64) // temp = cupy.zeros(lenparents, dtype=toptr.dtype) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// 
cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_countnonzero_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, temp, invocation_index, err_code)) // out["awkward_reduce_countnonzero_a", {dtype_specializations}] = None // out["awkward_reduce_countnonzero_b", {dtype_specializations}] = None -// out["awkward_reduce_countnonzero_c", {dtype_specializations}] = None // END PYTHON template @@ -25,7 +22,6 @@ awkward_reduce_countnonzero_a( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -33,7 +29,7 @@ awkward_reduce_countnonzero_a( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - atomic_toptr[thread_id] = 0; + toptr[thread_id] = 0; } } } @@ -46,7 +42,6 @@ awkward_reduce_countnonzero_b( const U* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -72,29 +67,8 @@ awkward_reduce_countnonzero_b( if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - atomicAdd(&atomic_toptr[parent], temp[idx]); + atomicAdd(&toptr[parent], temp[idx]); } } } } - -template -__global__ void -awkward_reduce_countnonzero_c( - T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomic_toptr, - T* temp, - uint64_t invocation_index, - uint64_t* err_code) { - if (err_code[0] == NO_ERROR) { - int64_t thread_id = 
blockIdx.x * blockDim.x + threadIdx.x; - - if (thread_id < outlength) { - toptr[thread_id] = static_cast(atomic_toptr[thread_id]); - } - } -} diff --git a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu index 89ad707471..9fbeea3c1f 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu @@ -176,22 +176,6 @@ __device__ __forceinline__ float atomicMax(float* addr, float value) { } -typedef long long int64_t; - - -template -struct is_int64_t { - static const bool value = false; -}; - - -template <> -struct is_int64_t { - static const bool value = true; -}; - - - __device__ int64_t atomicAdd(int64_t* address, int64_t val) { uint64_t* address_as_ull = (uint64_t*)address; uint64_t old = *address_as_ull, assumed; From 896770f1c4665891f04d1dd3e8ee67288405f6ef Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Wed, 12 Jun 2024 14:27:03 +0200 Subject: [PATCH 17/33] fix: all reducer for atomics --- kernel-test-data.json | 130 +++++++- ...tOffsetArray_reduce_local_outoffsets_64.cu | 51 +-- .../cuda_kernels/awkward_reduce_argmax.cu | 12 +- .../cuda_kernels/awkward_reduce_argmin.cu | 12 +- .../cuda_kernels/awkward_reduce_count_64.cu | 8 +- .../awkward_reduce_countnonzero.cu | 8 +- .../cuda/cuda_kernels/awkward_reduce_max.cu | 50 +-- .../cuda/cuda_kernels/awkward_reduce_min.cu | 54 +--- .../cuda/cuda_kernels/awkward_reduce_prod.cu | 33 +- .../cuda_kernels/awkward_reduce_prod_bool.cu | 33 +- .../cuda/cuda_kernels/awkward_reduce_sum.cu | 8 +- .../cuda_kernels/awkward_reduce_sum_bool.cu | 8 +- .../awkward_reduce_sum_int32_bool_64.cu | 8 +- .../awkward_reduce_sum_int64_bool_64.cu | 8 +- .../_connect/cuda/cuda_kernels/cuda_common.cu | 293 ++++++++++++++++-- tests-cuda/test_3136_cuda_reducers.py | 1 - 16 files changed, 488 insertions(+), 229 deletions(-) diff --git a/kernel-test-data.json b/kernel-test-data.json index efa4033e7b..e843bf9b05 100644 --- 
a/kernel-test-data.json +++ b/kernel-test-data.json @@ -13108,6 +13108,76 @@ } ] }, + { + "name": "awkward_ListArray_combinations", + "status": true, + "tests": [ + { + "error": false, + "message": "", + "inputs": { + "fromindex": [], + "length": 0, + "n": 0, + "replacement": false, + "starts": [], + "stops": [] + }, + "outputs": { + "tocarry": [[0], [0]], + "toindex": [0] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromindex": [0], + "length": 1, + "n": 2, + "replacement": false, + "starts": [0], + "stops": [2] + }, + "outputs": { + "tocarry": [[0, 1], [0, 1]], + "toindex": [1, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromindex": [0, 3, 3, 5, 7], + "length": 5, + "n": 2, + "replacement": false, + "starts": [0, 4, 4, 7, 8], + "stops": [4, 4, 7, 8, 13] + }, + "outputs": { + "tocarry": [[0, 6, 6, 9, 9, 19], [0, 6, 6, 9, 9, 19]], + "toindex": [0, 6, 6, 9, 9, 19] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromindex": [0, 4, 4, 7, 8], + "length": 5, + "n": 2, + "replacement": false, + "starts": [0, 3, 3, 10, 10], + "stops": [3, 3, 5, 10, 13] + }, + "outputs": { + "tocarry": [[0, 3, 3, 4, 4, 7], [0, 6, 6, 9, 9, 19]], + "toindex": [0, 6, 6, 9, 9, 19] + } + } + ] + }, { "name": "awkward_ListArray_getitem_jagged_carrylen", "status": true, @@ -24366,6 +24436,64 @@ } ] }, + { + "name": "awkward_reduce_sum_complex", + "status": true, + "tests": [ + { + "error": false, + "message": "", + "inputs": { + "fromptr": [], + "lenparents": 0, + "outlength": 0, + "parents": [] + }, + "outputs": { + "toptr": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [0, 0], + "lenparents": 1, + "outlength": 2, + "parents": [0] + }, + "outputs": { + "toptr": [0, 0] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [2, 2, 3, 3, 5, 5, 7, 7, 11, 11, 13, 13, 17, 17, 19, 19, 23, 23], + "lenparents": 9, + "outlength": 12, + "parents": [0, 0, 0, 2, 2, 3, 4, 4, 5] + }, + "outputs": { 
+ "toptr": [10, 10, 0, 0, 18, 18, 13, 13, 36, 36, 23, 23] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromptr": [1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1], + "lenparents": 6, + "outlength": 8, + "parents": [0, 0, 0, 2, 2, 3] + }, + "outputs": { + "toptr": [1, 3, 0, 0, 1, 2, 0, 1] + } + } + ] + }, { "name": "awkward_reduce_sum", "status": true, @@ -26349,7 +26477,7 @@ "error": false, "message": "", "inputs": { - "fromptr": [0, 4, 1, 3, 5, 6], + "fromptr": [0, 4, 1, 1, 5, 6], "identity": 9223372036854775807, "lenparents": 6, "outlength": 4, diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu index 3dcdf14727..42e8119d46 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu @@ -8,18 +8,15 @@ // grid_size = math.floor((lenparents + block[0] - 1) / block[0]) // else: // grid_size = 1 -// atomic_outoffsets = cupy.array(outoffsets, dtype=cupy.uint64) // temp = cupy.zeros(lenparents, dtype=cupy.int64) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, atomic_outoffsets, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, atomic_outoffsets, temp, invocation_index, err_code)) -// scan_in_array = cupy.zeros(outlength, dtype=cupy.int64) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_c", 
cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, atomic_outoffsets, scan_in_array, invocation_index, err_code)) +// scan_in_array = cupy.zeros(outlength, dtype=cupy.uint64) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, scan_in_array, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, scan_in_array, temp, invocation_index, err_code)) // scan_in_array = cupy.cumsum(scan_in_array) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_d", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, atomic_outoffsets, scan_in_array, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_c", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, scan_in_array, temp, invocation_index, err_code)) // out["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", {dtype_specializations}] = None // out["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", {dtype_specializations}] = None // out["awkward_ListOffsetArray_reduce_local_outoffsets_64_c", {dtype_specializations}] = None -// out["awkward_ListOffsetArray_reduce_local_outoffsets_64_d", {dtype_specializations}] = None // END PYTHON template @@ -29,7 +26,7 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_a( const C* parents, int64_t lenparents, int64_t outlength, - uint64_t* 
atomic_outoffsets, + uint64_t* scan_in_array, int64_t* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -37,7 +34,7 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_a( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - atomic_outoffsets[thread_id] = 0; + outoffsets[thread_id] = 0; } } } @@ -49,7 +46,7 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_b( const C* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomic_outoffsets, + uint64_t* scan_in_array, int64_t* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -58,24 +55,24 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_b( int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - temp[idx] = 1; + temp[thread_id] = 1; } __syncthreads(); for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { int64_t val = 0; if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - val = temp[idx - stride]; + val = temp[thread_id - stride]; } __syncthreads(); - temp[idx] += val; + temp[thread_id] += val; __syncthreads(); } if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - atomicAdd(&atomic_outoffsets[parent], temp[idx]); + atomicAdd(&scan_in_array[parent], temp[thread_id]); } } } @@ -88,28 +85,8 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_c( const C* parents, int64_t lenparents, int64_t outlength, - uint64_t* atomic_outoffsets, - int64_t* scan_in_array, - uint64_t invocation_index, - uint64_t* err_code) { - if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; - - if (thread_id < outlength) { - scan_in_array[thread_id] = atomic_outoffsets[thread_id]; - } - } -} - -template -__global__ void -awkward_ListOffsetArray_reduce_local_outoffsets_64_d( - T* outoffsets, - const C* parents, - int64_t lenparents, - 
int64_t outlength, - uint64_t* atomic_outoffsets, - int64_t* scan_in_array, + uint64_t* scan_in_array, + int64_t* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { @@ -117,7 +94,7 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_d( outoffsets[0] = 0; if (thread_id < outlength) { - outoffsets[thread_id + 1] = static_cast(scan_in_array[thread_id]); + outoffsets[thread_id + 1] = (T)(scan_in_array[thread_id]); } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu index 0202d7276f..df515f05a4 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu @@ -55,18 +55,18 @@ awkward_reduce_argmax_b( int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - temp[idx] = thread_id; + temp[thread_id] = thread_id; } __syncthreads(); for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { int64_t index = -1; if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - index = temp[idx - stride]; + index = temp[thread_id - stride]; } - if (index != -1 && (temp[idx] == -1 || fromptr[index] > fromptr[temp[idx]] || - (fromptr[index] == fromptr[temp[idx]] && index < temp[idx]))) { - temp[idx] = index; + if (index != -1 && (temp[thread_id] == -1 || fromptr[index] > fromptr[temp[thread_id]] || + (fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) { + temp[thread_id] = index; } __syncthreads(); } @@ -74,7 +74,7 @@ awkward_reduce_argmax_b( if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - atomicExch(&atomic_toptr[parent], temp[idx]); + atomicExch(&atomic_toptr[parent], temp[thread_id]); } } } diff --git 
a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu index e2215b1b11..af1d3fd93d 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu @@ -55,18 +55,18 @@ awkward_reduce_argmin_b( int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - temp[idx] = thread_id; + temp[thread_id] = thread_id; } __syncthreads(); for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { int64_t index = -1; if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - index = temp[idx - stride]; + index = temp[thread_id - stride]; } - if (index != -1 && (temp[idx] == -1 || fromptr[index] < fromptr[temp[idx]] || - (fromptr[index] == fromptr[temp[idx]] && index < temp[idx]))) { - temp[idx] = index; + if (index != -1 && (temp[thread_id] == -1 || fromptr[index] < fromptr[temp[thread_id]] || + (fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) { + temp[thread_id] = index; } __syncthreads(); } @@ -74,7 +74,7 @@ awkward_reduce_argmin_b( if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - atomicExch(&atomic_toptr[parent], temp[idx]); + atomicExch(&atomic_toptr[parent], temp[thread_id]); } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu index 2d317aebcd..9c55e69600 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu @@ -48,24 +48,24 @@ awkward_reduce_count_64_b( int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - temp[idx] = 1; + temp[thread_id] = 1; } __syncthreads(); for (int64_t 
stride = 1; stride < blockDim.x; stride *= 2) { int64_t val = 0; if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - val = temp[idx - stride]; + val = temp[thread_id - stride]; } __syncthreads(); - temp[idx] += val; + temp[thread_id] += val; __syncthreads(); } if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - atomicAdd(&toptr[parent], temp[idx]); + atomicAdd(&toptr[parent], temp[thread_id]); } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu index 7af29ad6c7..ffcb0b8bd3 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu @@ -50,24 +50,24 @@ awkward_reduce_countnonzero_b( int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - temp[idx] = (fromptr[thread_id] != 0) ? 1 : 0; + temp[thread_id] = (fromptr[thread_id] != 0) ? 
1 : 0; } __syncthreads(); for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { int64_t val = 0; if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - val = temp[idx - stride]; + val = temp[thread_id - stride]; } __syncthreads(); - temp[idx] += val; + temp[thread_id] += val; __syncthreads(); } if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - atomicAdd(&toptr[parent], temp[idx]); + atomicAdd(&toptr[parent], temp[thread_id]); } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu index a411d1970a..4afbe3f04c 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu @@ -4,19 +4,14 @@ // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, identity, invocation_index, err_code) = args // if block[0] > 0: -// segment = math.floor((outlength + block[0] - 1) / block[0]) // grid_size = math.floor((lenparents + block[0] - 1) / block[0]) // else: -// segment = 0 // grid_size = 1 -// partial = cupy.full(outlength * grid_size, identity, dtype=toptr.dtype) // temp = cupy.full(lenparents, identity, dtype=toptr.dtype) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code)) -// 
cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, temp, invocation_index, err_code)) // out["awkward_reduce_max_a", {dtype_specializations}] = None // out["awkward_reduce_max_b", {dtype_specializations}] = None -// out["awkward_reduce_max_c", {dtype_specializations}] = None // END PYTHON template @@ -27,14 +22,12 @@ awkward_reduce_max_a( const U* parents, int64_t lenparents, int64_t outlength, - int64_t identity, - T* partial, + T identity, T* temp, uint64_t invocation_index, uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_id < outlength) { toptr[thread_id] = static_cast(identity); } @@ -49,8 +42,7 @@ awkward_reduce_max_b( const U* parents, int64_t lenparents, int64_t outlength, - int64_t identity, - T* partial, + T identity, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -59,52 +51,26 @@ awkward_reduce_max_b( int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - temp[idx] = fromptr[thread_id]; + temp[thread_id] = fromptr[thread_id]; } __syncthreads(); for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { T val = static_cast(identity); + if (idx >= stride && thread_id < lenparents && 
parents[thread_id] == parents[thread_id - stride]) { val = temp[idx - stride]; } __syncthreads(); - temp[idx] = val > temp[idx] ? val : temp[idx]; + temp[thread_id] = val > temp[thread_id] ? val : temp[thread_id]; __syncthreads(); } if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - partial[blockIdx.x * outlength + parent] = temp[idx]; - } - } - } -} - -template -__global__ void -awkward_reduce_max_c( - T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - int64_t identity, - T* partial, - T* temp, - uint64_t invocation_index, - uint64_t* err_code) { - if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; - - if (thread_id < outlength) { - T maximum = static_cast(identity); - int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; - for (int64_t i = 0; i < blocks; ++i) { - maximum = maximum > partial[i * outlength + thread_id] ? 
maximum : partial[i * outlength + thread_id]; + atomicMax(&toptr[parent], temp[thread_id]); } - toptr[thread_id] = maximum; } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu index 828097a14f..34325d91f1 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu @@ -4,19 +4,14 @@ // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, identity, invocation_index, err_code) = args // if block[0] > 0: -// segment = math.floor((outlength + block[0] - 1) / block[0]) // grid_size = math.floor((lenparents + block[0] - 1) / block[0]) // else: -// segment = 0 // grid_size = 1 -// partial = cupy.full(outlength * grid_size, identity, dtype=toptr.dtype) // temp = cupy.full(lenparents, identity, dtype=toptr.dtype) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, identity, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, 
outlength, identity, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, temp, invocation_index, err_code)) // out["awkward_reduce_min_a", {dtype_specializations}] = None // out["awkward_reduce_min_b", {dtype_specializations}] = None -// out["awkward_reduce_min_c", {dtype_specializations}] = None // END PYTHON template @@ -27,8 +22,7 @@ awkward_reduce_min_a( const U* parents, int64_t lenparents, int64_t outlength, - int64_t identity, - T* partial, + T identity, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -36,7 +30,7 @@ awkward_reduce_min_a( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = static_cast(identity); + toptr[thread_id] = identity; } } } @@ -49,8 +43,7 @@ awkward_reduce_min_b( const U* parents, int64_t lenparents, int64_t outlength, - int64_t identity, - T* partial, + T identity, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -59,52 +52,25 @@ awkward_reduce_min_b( int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - temp[idx] = fromptr[thread_id]; + temp[thread_id] = fromptr[thread_id]; } __syncthreads(); for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { - T val = static_cast(identity); + T val = identity; if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - val = temp[idx - stride]; + val = temp[thread_id - stride]; } __syncthreads(); - temp[idx] = val < temp[idx] ? val : temp[idx]; + temp[thread_id] = val < temp[thread_id] ? 
val : temp[thread_id]; __syncthreads(); } if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - partial[blockIdx.x * outlength + parent] = temp[idx]; + atomicMin(&toptr[parent], temp[thread_id]); } } } } - -template -__global__ void -awkward_reduce_min_c( - T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - int64_t identity, - T* partial, - T* temp, - uint64_t invocation_index, - uint64_t* err_code) { - if (err_code[0] == NO_ERROR) { - int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; - - if (thread_id < outlength) { - T minimum = static_cast(identity); - int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; - for (int64_t i = 0; i < blocks; ++i) { - minimum = minimum < partial[i * outlength + thread_id] ? minimum : partial[i * outlength + thread_id]; - } - toptr[thread_id] = minimum; - } - } -} diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu index 1f7e6d4ff0..9248e20efc 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu @@ -4,16 +4,14 @@ // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args // if block[0] > 0: -// segment = math.floor((outlength + block[0] - 1) / block[0]) // grid_size = math.floor((lenparents + block[0] - 1) / block[0]) // else: -// segment = 0 // grid_size = 1 -// partial = cupy.ones(outlength * grid_size, dtype=toptr.dtype) +// atomic_toptr = cupy.array(toptr, dtype=toptr.dtype) // temp = cupy.ones(lenparents, dtype=toptr.dtype) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, 
fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) // out["awkward_reduce_prod_a", {dtype_specializations}] = None // out["awkward_reduce_prod_b", {dtype_specializations}] = None // out["awkward_reduce_prod_c", {dtype_specializations}] = None @@ -27,7 +25,7 @@ awkward_reduce_prod_a( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, + T* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -35,7 +33,7 @@ awkward_reduce_prod_a( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < 
outlength) { - toptr[thread_id] = 1; + atomic_toptr[thread_id] = 1; } } } @@ -48,7 +46,7 @@ awkward_reduce_prod_b( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, + T* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -57,24 +55,24 @@ awkward_reduce_prod_b( int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - temp[idx] = fromptr[thread_id]; + temp[thread_id] = fromptr[thread_id]; } __syncthreads(); for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { T val = 1; if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - val = temp[idx - stride]; + val = temp[thread_id - stride]; } __syncthreads(); - temp[idx] *= val; + temp[thread_id] *= val; __syncthreads(); } if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - partial[blockIdx.x * outlength + parent] = temp[idx]; + atomicMul(&atomic_toptr[parent], temp[thread_id]); } } } @@ -88,7 +86,7 @@ awkward_reduce_prod_c( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, + T* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -96,12 +94,7 @@ awkward_reduce_prod_c( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - T prod = 1; - int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; - for (int64_t i = 0; i < blocks; ++i) { - prod *= partial[i * outlength + thread_id]; - } - toptr[thread_id] = prod; + toptr[thread_id] = (T)(atomic_toptr[thread_id]); } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu index af10c4f40d..9d85b366c7 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu @@ -4,16 
+4,14 @@ // def f(grid, block, args): // (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args // if block[0] > 0: -// segment = math.floor((outlength + block[0] - 1) / block[0]) // grid_size = math.floor((lenparents + block[0] - 1) / block[0]) // else: -// segment = 0 // grid_size = 1 -// partial = cupy.ones(outlength * grid_size, dtype=toptr.dtype) +// atomic_toptr = cupy.array(toptr, dtype=cupy.uint32) // temp = cupy.ones(lenparents, dtype=toptr.dtype) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_a", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_b", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_c", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((segment,), block, (toptr, fromptr, parents, lenparents, outlength, partial, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_a", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_b", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_prod_bool_c", bool_, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, 
lenparents, outlength, atomic_toptr, temp, invocation_index, err_code)) // out["awkward_reduce_prod_bool_a", {dtype_specializations}] = None // out["awkward_reduce_prod_bool_b", {dtype_specializations}] = None // out["awkward_reduce_prod_bool_c", {dtype_specializations}] = None @@ -27,7 +25,7 @@ awkward_reduce_prod_bool_a( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint32_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -35,7 +33,7 @@ awkward_reduce_prod_bool_a( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = 1; + atomic_toptr[thread_id] = 1; } } } @@ -48,7 +46,7 @@ awkward_reduce_prod_bool_b( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint32_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -57,24 +55,24 @@ awkward_reduce_prod_bool_b( int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - temp[idx] = fromptr[thread_id]; + temp[thread_id] = fromptr[thread_id]; } __syncthreads(); for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { T val = 1; if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - val = temp[idx - stride]; + val = temp[thread_id - stride]; } __syncthreads(); - temp[idx] &= (val != 0); + temp[thread_id] &= (val != 0); __syncthreads(); } if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - partial[blockIdx.x * outlength + parent] = temp[idx]; + atomicAnd(&atomic_toptr[parent], temp[thread_id]); } } } @@ -88,7 +86,7 @@ awkward_reduce_prod_bool_c( const U* parents, int64_t lenparents, int64_t outlength, - T* partial, + uint32_t* atomic_toptr, T* temp, uint64_t invocation_index, uint64_t* err_code) { @@ -96,12 +94,7 @@ awkward_reduce_prod_bool_c( int64_t thread_id = blockIdx.x 
* blockDim.x + threadIdx.x; if (thread_id < outlength) { - T prod = 1; - int64_t blocks = (lenparents + blockDim.x - 1) / blockDim.x; - for (int64_t i = 0; i < blocks; ++i) { - prod &= (partial[i * outlength + thread_id] != 0); - } - toptr[thread_id] = prod; + toptr[thread_id] = (T)(atomic_toptr[thread_id]); } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu index bcbad2e07a..8ce2b8159c 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu @@ -50,24 +50,24 @@ awkward_reduce_sum_b( int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - temp[idx] = fromptr[thread_id]; + temp[thread_id] = fromptr[thread_id]; } __syncthreads(); for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { T val = 0; if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - val = temp[idx - stride]; + val = temp[thread_id - stride]; } __syncthreads(); - temp[idx] += val; + temp[thread_id] += val; __syncthreads(); } if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - atomicAdd(&toptr[parent], temp[idx]); + atomicAdd(&toptr[parent], temp[thread_id]); } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu index d233a9b9ed..f85df8e20a 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu @@ -55,24 +55,24 @@ awkward_reduce_sum_bool_b( int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - temp[idx] = fromptr[thread_id]; + temp[thread_id] = fromptr[thread_id]; } __syncthreads(); for (int64_t stride = 1; 
stride < blockDim.x; stride *= 2) { T val = 0; if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - val = temp[idx - stride]; + val = temp[thread_id - stride]; } __syncthreads(); - temp[idx] |= (val != 0); + temp[thread_id] |= (val != 0); __syncthreads(); } if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - atomicOr(&atomic_toptr[parent], temp[idx]); + atomicOr(&atomic_toptr[parent], temp[thread_id]); } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu index 9aa8636e9b..f52b6fb21c 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu @@ -50,24 +50,24 @@ awkward_reduce_sum_int32_bool_64_b( int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - temp[idx] = (fromptr[thread_id] != 0) ? 1 : 0;; + temp[thread_id] = (fromptr[thread_id] != 0) ? 
1 : 0;; } __syncthreads(); for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { T val = 0; if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - val = temp[idx - stride]; + val = temp[thread_id - stride]; } __syncthreads(); - temp[idx] += val; + temp[thread_id] += val; __syncthreads(); } if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - atomicAdd(&toptr[parent], temp[idx]); + atomicAdd(&toptr[parent], temp[thread_id]); } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu index 9f6399eac2..7e220cccc0 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu @@ -50,24 +50,24 @@ awkward_reduce_sum_int64_bool_64_b( int64_t thread_id = blockIdx.x * blockDim.x + idx; if (thread_id < lenparents) { - temp[idx] = (fromptr[thread_id] != 0) ? 1 : 0;; + temp[thread_id] = (fromptr[thread_id] != 0) ? 
1 : 0;; } __syncthreads(); for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { T val = 0; if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - val = temp[idx - stride]; + val = temp[thread_id - stride]; } __syncthreads(); - temp[idx] += val; + temp[thread_id] += val; __syncthreads(); } if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { - atomicAdd(&toptr[parent], temp[idx]); + atomicAdd(&toptr[parent], temp[thread_id]); } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu index cdb7babcfa..9d55a7b713 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu @@ -53,6 +53,10 @@ typedef unsigned long long uintmax_t; // raise ValueError("Unsupported dtype.", dtype) // END PYTHON + +// used by awkward_ListArray_getitem_next_range_carrylength +// and awkward_ListArray_getitem_next_range kernels + const int64_t kMaxInt64 = 9223372036854775806; // 2**63 - 2: see below const int64_t kSliceNone = kMaxInt64 + 1; // for Slice::none() @@ -64,43 +68,200 @@ awkward_regularize_rangeslice( bool hasstart, bool hasstop, int64_t length) { - if (posstep) { - if (!hasstart) *start = 0; - else if (*start < 0) *start += length; - if (*start < 0) *start = 0; - if (*start > length) *start = length; - - if (!hasstop) *stop = length; - else if (*stop < 0) *stop += length; - if (*stop < 0) *stop = 0; - if (*stop > length) *stop = length; - if (*stop < *start) *stop = *start; - } - - else { - if (!hasstart) *start = length - 1; - else if (*start < 0) *start += length; - if (*start < -1) *start = -1; - if (*start > length - 1) *start = length - 1; - - if (!hasstop) *stop = -1; - else if (*stop < 0) *stop += length; - if (*stop < -1) *stop = -1; - if (*stop > length - 1) 
*stop = length - 1; - if (*stop > *start) *stop = *start; - } + if (posstep) { + if (!hasstart) *start = 0; + else if (*start < 0) *start += length; + if (*start < 0) *start = 0; + if (*start > length) *start = length; + + if (!hasstop) *stop = length; + else if (*stop < 0) *stop += length; + if (*stop < 0) *stop = 0; + if (*stop > length) *stop = length; + if (*stop < *start) *stop = *start; + } + + else { + if (!hasstart) *start = length - 1; + else if (*start < 0) *start += length; + if (*start < -1) *start = -1; + if (*start > length - 1) *start = length - 1; + + if (!hasstop) *stop = -1; + else if (*stop < 0) *stop += length; + if (*stop < -1) *stop = -1; + if (*stop > length - 1) *stop = length - 1; + if (*stop > *start) *stop = *start; } } + +// atomicMin() specializations +template +__device__ T atomicMin(T* address, T val); + +// atomicMin() specialization for int8_t +template <> +__device__ int8_t atomicMin(int8_t* address, int8_t val) { + unsigned int *base_address = (unsigned int *)((size_t)address & ~3); + unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210}; + unsigned int sel = selectors[(size_t)address & 3]; + unsigned int old, assumed, min_, new_; + old = *base_address; + do { + assumed = old; + min_ = min(val, (int8_t)__byte_perm(old, 0, ((size_t)address & 3))); + new_ = __byte_perm(old, min_, sel); + old = atomicCAS(base_address, assumed, new_); + } while (assumed != old); + return old; +} + +// atomicMin() specialization for uint8_t +template <> +__device__ uint8_t atomicMin(uint8_t* address, uint8_t val) { + unsigned int *base_address = (unsigned int *)((size_t)address & ~3); + unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210}; + unsigned int sel = selectors[(size_t)address & 3]; + unsigned int old, assumed, min_, new_; + old = *base_address; + do { + assumed = old; + min_ = min(val, (uint8_t)__byte_perm(old, 0, ((size_t)address & 3))); + new_ = __byte_perm(old, min_, sel); + old = atomicCAS(base_address, assumed, new_); + } 
while (assumed != old); + return old; +} + +// atomicMin() specialization for int16_t +template <> +__device__ int16_t atomicMin(int16_t* address, int16_t val) { + uint16_t* address_as_ush = reinterpret_cast(address); + uint16_t old = *address_as_ush, assumed; + do { + assumed = old; + int16_t temp = min(val, reinterpret_cast(assumed)); + old = atomicCAS( + address_as_ush, assumed, reinterpret_cast(temp) + ); + } while (assumed != old); + return reinterpret_cast(old); +} + +// atomicMin() specialization for uint16_t +template <> +__device__ uint16_t atomicMin(uint16_t* address, uint16_t val) { + uint16_t old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, min(val, assumed)); + } while (assumed != old); + return old; +} + +// atomicMin() specialization for float __device__ __forceinline__ float atomicMin(float* addr, float value) { - float old; old = !signbit(value) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value))) : __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value))); + float old; + old = !signbit(value) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value))) + : __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value))); + return old; +} + +// atomicMin() specialization for double +__device__ __forceinline__ double atomicMin(double* addr, double value) { + double old; + old = !signbit(value) ? 
__longlong_as_double(atomicMin((long long int*)addr, __double_as_longlong(value))) + : __ull2double_rz(atomicMax((unsigned long long int*)addr, __double2ull_ru(value))); + return old; +} + + +// atomicMax() specializations +template +__device__ T atomicMax(T* address, T val); + +// atomicMax() specialization for int8_t +template <> +__device__ int8_t atomicMax(int8_t* address, int8_t val) { + unsigned int *base_address = (unsigned int *)((size_t)address & ~3); + unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210}; + unsigned int sel = selectors[(size_t)address & 3]; + unsigned int old, assumed, max_, new_; + old = *base_address; + do { + assumed = old; + max_ = max(val, (int8_t)__byte_perm(old, 0, ((size_t)address & 3))); + new_ = __byte_perm(old, max_, sel); + old = atomicCAS(base_address, assumed, new_); + } while (assumed != old); + return old; +} + +// atomicMax() specialization for uint8_t +template <> +__device__ uint8_t atomicMax(uint8_t* address, uint8_t val) { + unsigned int *base_address = (unsigned int *)((size_t)address & ~3); + unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210}; + unsigned int sel = selectors[(size_t)address & 3]; + unsigned int old, assumed, max_, new_; + old = *base_address; + do { + assumed = old; + max_ = max(val, (uint8_t)__byte_perm(old, 0, ((size_t)address & 3))); + new_ = __byte_perm(old, max_, sel); + old = atomicCAS(base_address, assumed, new_); + } while (assumed != old); + return old; +} + +// atomicMax() specialization for int16_t +template <> +__device__ int16_t atomicMax(int16_t* address, int16_t val) { + uint16_t* address_as_ush = reinterpret_cast(address); + uint16_t old = *address_as_ush, assumed; + do { + assumed = old; + int16_t temp = max(val, reinterpret_cast(assumed)); + old = atomicCAS( + address_as_ush, assumed, reinterpret_cast(temp) + ); + } while (assumed != old); + return reinterpret_cast(old); +} + +// atomicMax() specialization for uint16_t +template <> +__device__ uint16_t 
atomicMax(uint16_t* address, uint16_t val) { + uint16_t old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, max(val, assumed)); + } while (assumed != old); + return old; +} + +// atomicMax() specialization for float +template <> +__device__ float atomicMax(float* addr, float value) { + float old; + old = !signbit(value) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) + : __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value))); return old; } -__device__ __forceinline__ float atomicMax(float* addr, float value) { - float old; old = !signbit(value) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) : __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value))); + +// atomicMax() specialization for double +template <> +__device__ double atomicMax(double* addr, double value) { + double old; + old = !signbit(value) ? __longlong_as_double(atomicMax((long long int*)addr, __double_as_longlong(value))) + : __ull2double_rz(atomicMin((unsigned long long int*)addr, __double2ull_ru(value))); return old; } + +// atomicAdd() specialization for int64_t +// uses 2's complement __device__ int64_t atomicAdd(int64_t* address, int64_t val) { uint64_t* address_as_ull = (uint64_t*)address; uint64_t old = *address_as_ull, assumed; @@ -110,3 +271,79 @@ __device__ int64_t atomicAdd(int64_t* address, int64_t val) { } while (assumed != old); return (int64_t)old; } + + +// atomicMul() specializations +template +__device__ T atomicMul(T* address, T val); + +// atomicMul() specialization for int32_t +template <> +__device__ int32_t atomicMul(int32_t* address, int32_t val) { + int32_t old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, assumed * val); + } while (assumed != old); + return old; +} + +// atomicMul() specialization for uint32_t +template <> +__device__ uint32_t atomicMul(uint32_t* address, uint32_t val) { + uint32_t old = *address, assumed; + do { + assumed = old; + 
old = atomicCAS(address, assumed, assumed * val); + } while (assumed != old); + return old; +} + +// atomicMul() specialization for int64_t +template <> +__device__ int64_t atomicMul(int64_t* address, int64_t val) { + uint64_t* address_as_uint64 = reinterpret_cast(address); + uint64_t old = *address_as_uint64, assumed; + uint64_t val_as_uint64 = *reinterpret_cast(&val); + + do { + assumed = old; + old = atomicCAS(address_as_uint64, assumed, assumed * val_as_uint64); + } while (assumed != old); + + return *reinterpret_cast(&old); +} + +// atomicMul() specialization for uint64_t +template <> +__device__ uint64_t atomicMul(uint64_t* address, uint64_t val) { + uint64_t old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, assumed * val); + } while (assumed != old); + return old; +} + +// atomicMul() specialization for float +template <> +__device__ float atomicMul(float* address, float val) { + float old = *address, assumed; + do { + assumed = old; + old = __int_as_float(atomicCAS((int*)address, __float_as_int(assumed), __float_as_int(assumed * val))); + } while (assumed != old); + return old; +} + +// atomicMul() specialization for double +template <> +__device__ double atomicMul(double* address, double val) { + uint64_t* address_as_ull = (uint64_t*)address; + uint64_t old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, __double_as_longlong(__longlong_as_double(assumed) * val)); + } while (assumed != old); + return __longlong_as_double(old); +} diff --git a/tests-cuda/test_3136_cuda_reducers.py b/tests-cuda/test_3136_cuda_reducers.py index 3bc554e1a1..f382f852dd 100644 --- a/tests-cuda/test_3136_cuda_reducers.py +++ b/tests-cuda/test_3136_cuda_reducers.py @@ -275,7 +275,6 @@ def test_min(): ak.min(array, axis=None, keepdims=True, initial=-100.0, mask_identity=False), ak.to_regular(ak.Array([[-100.0]], backend="cuda")), ) - assert ak.almost_equal( ak.min(array, axis=None, keepdims=True, 
mask_identity=True), ak.to_regular( From f3d1cdc2f24b49ae0421b7b29c6e67305b8aeab0 Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Wed, 12 Jun 2024 14:54:35 +0200 Subject: [PATCH 18/33] fix: missing template --- .../_connect/cuda/cuda_kernels/awkward_reduce_max.cu | 4 ++-- src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu index 4afbe3f04c..26512bb8ec 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu @@ -29,7 +29,7 @@ awkward_reduce_max_a( if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id] = static_cast(identity); + toptr[thread_id] = identity; } } } @@ -56,7 +56,7 @@ awkward_reduce_max_b( __syncthreads(); for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { - T val = static_cast(identity); + T val = identity; if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { val = temp[idx - stride]; diff --git a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu index 9d55a7b713..27bbd1ff60 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu @@ -161,7 +161,8 @@ __device__ uint16_t atomicMin(uint16_t* address, uint16_t val) { } // atomicMin() specialization for float -__device__ __forceinline__ float atomicMin(float* addr, float value) { +template <> +__device__ float atomicMin(float* addr, float value) { float old; old = !signbit(value) ? 
__int_as_float(atomicMin((int*)addr, __float_as_int(value))) : __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value))); @@ -169,7 +170,8 @@ __device__ __forceinline__ float atomicMin(float* addr, float value) { } // atomicMin() specialization for double -__device__ __forceinline__ double atomicMin(double* addr, double value) { +template <> +__device__ double atomicMin(double* addr, double value) { double old; old = !signbit(value) ? __longlong_as_double(atomicMin((long long int*)addr, __double_as_longlong(value))) : __ull2double_rz(atomicMax((unsigned long long int*)addr, __double2ull_ru(value))); From ef47eadde8b47039fe0f0fe6711ef01c1be5908f Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Wed, 12 Jun 2024 15:05:53 +0200 Subject: [PATCH 19/33] fix: remove complex --- kernel-test-data.json | 58 ------------------------------------------- 1 file changed, 58 deletions(-) diff --git a/kernel-test-data.json b/kernel-test-data.json index e843bf9b05..ec7844c6c2 100644 --- a/kernel-test-data.json +++ b/kernel-test-data.json @@ -24436,64 +24436,6 @@ } ] }, - { - "name": "awkward_reduce_sum_complex", - "status": true, - "tests": [ - { - "error": false, - "message": "", - "inputs": { - "fromptr": [], - "lenparents": 0, - "outlength": 0, - "parents": [] - }, - "outputs": { - "toptr": [] - } - }, - { - "error": false, - "message": "", - "inputs": { - "fromptr": [0, 0], - "lenparents": 1, - "outlength": 2, - "parents": [0] - }, - "outputs": { - "toptr": [0, 0] - } - }, - { - "error": false, - "message": "", - "inputs": { - "fromptr": [2, 2, 3, 3, 5, 5, 7, 7, 11, 11, 13, 13, 17, 17, 19, 19, 23, 23], - "lenparents": 9, - "outlength": 12, - "parents": [0, 0, 0, 2, 2, 3, 4, 4, 5] - }, - "outputs": { - "toptr": [10, 10, 0, 0, 18, 18, 13, 13, 36, 36, 23, 23] - } - }, - { - "error": false, - "message": "", - "inputs": { - "fromptr": [1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1], - "lenparents": 6, - "outlength": 8, - "parents": [0, 0, 0, 2, 2, 3] - }, - "outputs": { - "toptr": 
[1, 3, 0, 0, 1, 2, 0, 1] - } - } - ] - }, { "name": "awkward_reduce_sum", "status": true, From c881f1d3b16f484b21b4b86afbb2ca5a87b610c4 Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Wed, 12 Jun 2024 15:32:41 +0200 Subject: [PATCH 20/33] fix: atomicMin() for float 32 and indentation --- .../_connect/cuda/cuda_kernels/cuda_common.cu | 264 +++++++++--------- 1 file changed, 132 insertions(+), 132 deletions(-) diff --git a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu index 27bbd1ff60..a9ff6e1ce0 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu @@ -103,78 +103,81 @@ __device__ T atomicMin(T* address, T val); // atomicMin() specialization for int8_t template <> __device__ int8_t atomicMin(int8_t* address, int8_t val) { - unsigned int *base_address = (unsigned int *)((size_t)address & ~3); - unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210}; - unsigned int sel = selectors[(size_t)address & 3]; - unsigned int old, assumed, min_, new_; - old = *base_address; - do { - assumed = old; - min_ = min(val, (int8_t)__byte_perm(old, 0, ((size_t)address & 3))); - new_ = __byte_perm(old, min_, sel); - old = atomicCAS(base_address, assumed, new_); - } while (assumed != old); - return old; + unsigned int *base_address = (unsigned int *)((size_t)address & ~3); + unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210}; + unsigned int sel = selectors[(size_t)address & 3]; + unsigned int old, assumed, min_, new_; + old = *base_address; + do { + assumed = old; + min_ = min(val, (int8_t)__byte_perm(old, 0, ((size_t)address & 3))); + new_ = __byte_perm(old, min_, sel); + old = atomicCAS(base_address, assumed, new_); + } while (assumed != old); + return old; } // atomicMin() specialization for uint8_t template <> __device__ uint8_t atomicMin(uint8_t* address, uint8_t val) { - unsigned int *base_address = (unsigned int *)((size_t)address 
& ~3); - unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210}; - unsigned int sel = selectors[(size_t)address & 3]; - unsigned int old, assumed, min_, new_; - old = *base_address; - do { - assumed = old; - min_ = min(val, (uint8_t)__byte_perm(old, 0, ((size_t)address & 3))); - new_ = __byte_perm(old, min_, sel); - old = atomicCAS(base_address, assumed, new_); - } while (assumed != old); - return old; + unsigned int *base_address = (unsigned int *)((size_t)address & ~3); + unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210}; + unsigned int sel = selectors[(size_t)address & 3]; + unsigned int old, assumed, min_, new_; + old = *base_address; + do { + assumed = old; + min_ = min(val, (uint8_t)__byte_perm(old, 0, ((size_t)address & 3))); + new_ = __byte_perm(old, min_, sel); + old = atomicCAS(base_address, assumed, new_); + } while (assumed != old); + return old; } // atomicMin() specialization for int16_t template <> __device__ int16_t atomicMin(int16_t* address, int16_t val) { - uint16_t* address_as_ush = reinterpret_cast(address); - uint16_t old = *address_as_ush, assumed; - do { - assumed = old; - int16_t temp = min(val, reinterpret_cast(assumed)); - old = atomicCAS( - address_as_ush, assumed, reinterpret_cast(temp) - ); - } while (assumed != old); - return reinterpret_cast(old); + uint16_t* address_as_ush = reinterpret_cast(address); + uint16_t old = *address_as_ush, assumed; + do { + assumed = old; + int16_t temp = min(val, reinterpret_cast(assumed)); + old = atomicCAS( + address_as_ush, assumed, reinterpret_cast(temp) + ); + } while (assumed != old); + return reinterpret_cast(old); } // atomicMin() specialization for uint16_t template <> __device__ uint16_t atomicMin(uint16_t* address, uint16_t val) { - uint16_t old = *address, assumed; - do { - assumed = old; - old = atomicCAS(address, assumed, min(val, assumed)); - } while (assumed != old); - return old; + uint16_t old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, 
assumed, min(val, assumed)); + } while (assumed != old); + return old; } // atomicMin() specialization for float template <> __device__ float atomicMin(float* addr, float value) { - float old; - old = !signbit(value) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value))) - : __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value))); - return old; + int* address_as_i = (int*)addr; + int old = *address_as_i, assumed; + do { + assumed = old; + old = atomicCAS(address_as_i, assumed, __float_as_int(fminf(value, __int_as_float(assumed)))); + } while (assumed != old); + return __int_as_float(old); } // atomicMin() specialization for double template <> __device__ double atomicMin(double* addr, double value) { double old; - old = !signbit(value) ? __longlong_as_double(atomicMin((long long int*)addr, __double_as_longlong(value))) - : __ull2double_rz(atomicMax((unsigned long long int*)addr, __double2ull_ru(value))); + old = !signbit(value) ? __longlong_as_double(atomicMin((long long int*)addr, __double_as_longlong(value))) : + __ull2double_rz(atomicMax((unsigned long long int*)addr, __double2ull_ru(value))); return old; } @@ -186,78 +189,77 @@ __device__ T atomicMax(T* address, T val); // atomicMax() specialization for int8_t template <> __device__ int8_t atomicMax(int8_t* address, int8_t val) { - unsigned int *base_address = (unsigned int *)((size_t)address & ~3); - unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210}; - unsigned int sel = selectors[(size_t)address & 3]; - unsigned int old, assumed, max_, new_; - old = *base_address; - do { - assumed = old; - max_ = max(val, (int8_t)__byte_perm(old, 0, ((size_t)address & 3))); - new_ = __byte_perm(old, max_, sel); - old = atomicCAS(base_address, assumed, new_); - } while (assumed != old); - return old; + unsigned int *base_address = (unsigned int *)((size_t)address & ~3); + unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210}; + unsigned int sel = selectors[(size_t)address & 3]; + unsigned 
int old, assumed, max_, new_; + old = *base_address; + do { + assumed = old; + max_ = max(val, (int8_t)__byte_perm(old, 0, ((size_t)address & 3))); + new_ = __byte_perm(old, max_, sel); + old = atomicCAS(base_address, assumed, new_); + } while (assumed != old); + return old; } // atomicMax() specialization for uint8_t template <> __device__ uint8_t atomicMax(uint8_t* address, uint8_t val) { - unsigned int *base_address = (unsigned int *)((size_t)address & ~3); - unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210}; - unsigned int sel = selectors[(size_t)address & 3]; - unsigned int old, assumed, max_, new_; - old = *base_address; - do { - assumed = old; - max_ = max(val, (uint8_t)__byte_perm(old, 0, ((size_t)address & 3))); - new_ = __byte_perm(old, max_, sel); - old = atomicCAS(base_address, assumed, new_); - } while (assumed != old); - return old; + unsigned int *base_address = (unsigned int *)((size_t)address & ~3); + unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210}; + unsigned int sel = selectors[(size_t)address & 3]; + unsigned int old, assumed, max_, new_; + old = *base_address; + do { + assumed = old; + max_ = max(val, (uint8_t)__byte_perm(old, 0, ((size_t)address & 3))); + new_ = __byte_perm(old, max_, sel); + old = atomicCAS(base_address, assumed, new_); + } while (assumed != old); + return old; } // atomicMax() specialization for int16_t template <> __device__ int16_t atomicMax(int16_t* address, int16_t val) { - uint16_t* address_as_ush = reinterpret_cast(address); - uint16_t old = *address_as_ush, assumed; - do { - assumed = old; - int16_t temp = max(val, reinterpret_cast(assumed)); - old = atomicCAS( - address_as_ush, assumed, reinterpret_cast(temp) - ); - } while (assumed != old); - return reinterpret_cast(old); + uint16_t* address_as_ush = reinterpret_cast(address); + uint16_t old = *address_as_ush, assumed; + do { + assumed = old; + int16_t temp = max(val, reinterpret_cast(assumed)); + old = atomicCAS( + address_as_ush, assumed, 
reinterpret_cast(temp) + ); + } while (assumed != old); + return reinterpret_cast(old); } // atomicMax() specialization for uint16_t template <> __device__ uint16_t atomicMax(uint16_t* address, uint16_t val) { - uint16_t old = *address, assumed; - do { - assumed = old; - old = atomicCAS(address, assumed, max(val, assumed)); - } while (assumed != old); - return old; + uint16_t old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, max(val, assumed)); + } while (assumed != old); + return old; } // atomicMax() specialization for float template <> __device__ float atomicMax(float* addr, float value) { float old; - old = !signbit(value) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) - : __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value))); + old = !signbit(value) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) : + __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value))); return old; } - // atomicMax() specialization for double template <> __device__ double atomicMax(double* addr, double value) { double old; - old = !signbit(value) ? __longlong_as_double(atomicMax((long long int*)addr, __double_as_longlong(value))) - : __ull2double_rz(atomicMin((unsigned long long int*)addr, __double2ull_ru(value))); + old = !signbit(value) ? 
__longlong_as_double(atomicMax((long long int*)addr, __double_as_longlong(value))) : + __ull2double_rz(atomicMin((unsigned long long int*)addr, __double2ull_ru(value))); return old; } @@ -282,70 +284,68 @@ __device__ T atomicMul(T* address, T val); // atomicMul() specialization for int32_t template <> __device__ int32_t atomicMul(int32_t* address, int32_t val) { - int32_t old = *address, assumed; - do { - assumed = old; - old = atomicCAS(address, assumed, assumed * val); - } while (assumed != old); - return old; + int32_t old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, assumed * val); + } while (assumed != old); + return old; } // atomicMul() specialization for uint32_t template <> __device__ uint32_t atomicMul(uint32_t* address, uint32_t val) { - uint32_t old = *address, assumed; - do { - assumed = old; - old = atomicCAS(address, assumed, assumed * val); - } while (assumed != old); - return old; + uint32_t old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, assumed * val); + } while (assumed != old); + return old; } // atomicMul() specialization for int64_t template <> __device__ int64_t atomicMul(int64_t* address, int64_t val) { - uint64_t* address_as_uint64 = reinterpret_cast(address); - uint64_t old = *address_as_uint64, assumed; - uint64_t val_as_uint64 = *reinterpret_cast(&val); - - do { - assumed = old; - old = atomicCAS(address_as_uint64, assumed, assumed * val_as_uint64); - } while (assumed != old); - - return *reinterpret_cast(&old); + uint64_t* address_as_uint64 = reinterpret_cast(address); + uint64_t old = *address_as_uint64, assumed; + uint64_t val_as_uint64 = *reinterpret_cast(&val); + do { + assumed = old; + old = atomicCAS(address_as_uint64, assumed, assumed * val_as_uint64); + } while (assumed != old); + return *reinterpret_cast(&old); } // atomicMul() specialization for uint64_t template <> __device__ uint64_t atomicMul(uint64_t* address, uint64_t val) { - uint64_t old = 
*address, assumed; - do { - assumed = old; - old = atomicCAS(address, assumed, assumed * val); - } while (assumed != old); - return old; + uint64_t old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, assumed * val); + } while (assumed != old); + return old; } // atomicMul() specialization for float template <> __device__ float atomicMul(float* address, float val) { - float old = *address, assumed; - do { - assumed = old; - old = __int_as_float(atomicCAS((int*)address, __float_as_int(assumed), __float_as_int(assumed * val))); - } while (assumed != old); - return old; + float old = *address, assumed; + do { + assumed = old; + old = __int_as_float(atomicCAS((int*)address, __float_as_int(assumed), __float_as_int(assumed * val))); + } while (assumed != old); + return old; } // atomicMul() specialization for double template <> __device__ double atomicMul(double* address, double val) { - uint64_t* address_as_ull = (uint64_t*)address; - uint64_t old = *address_as_ull, assumed; - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, __double_as_longlong(__longlong_as_double(assumed) * val)); - } while (assumed != old); - return __longlong_as_double(old); + uint64_t* address_as_ull = (uint64_t*)address; + uint64_t old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, __double_as_longlong(__longlong_as_double(assumed) * val)); + } while (assumed != old); + return __longlong_as_double(old); } From 38d30b9d186e91f34a966ff0d515efeb95f94c9b Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Wed, 12 Jun 2024 15:45:52 +0200 Subject: [PATCH 21/33] fix: pass correct dtype of identity --- .../_connect/cuda/cuda_kernels/awkward_reduce_max.cu | 4 ++-- .../_connect/cuda/cuda_kernels/awkward_reduce_min.cu | 4 ++-- src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu | 11 ++++------- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git 
a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu index 26512bb8ec..6a3fe66055 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu @@ -8,8 +8,8 @@ // else: // grid_size = 1 // temp = cupy.full(lenparents, identity, dtype=toptr.dtype) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, toptr.dtype.type(identity), temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_max_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, toptr.dtype.type(identity), temp, invocation_index, err_code)) // out["awkward_reduce_max_a", {dtype_specializations}] = None // out["awkward_reduce_max_b", {dtype_specializations}] = None // END PYTHON diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu index 34325d91f1..12a72b338f 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu 
@@ -8,8 +8,8 @@ // else: // grid_size = 1 // temp = cupy.full(lenparents, identity, dtype=toptr.dtype) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, temp, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, identity, temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, toptr.dtype.type(identity), temp, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_min_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, toptr.dtype.type(identity), temp, invocation_index, err_code)) // out["awkward_reduce_min_a", {dtype_specializations}] = None // out["awkward_reduce_min_b", {dtype_specializations}] = None // END PYTHON diff --git a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu index a9ff6e1ce0..9e8eb2bb35 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu @@ -163,13 +163,10 @@ __device__ uint16_t atomicMin(uint16_t* address, uint16_t val) { // atomicMin() specialization for float template <> __device__ float atomicMin(float* addr, float value) { - int* address_as_i = (int*)addr; - int old = *address_as_i, assumed; - do { - assumed = old; - old = atomicCAS(address_as_i, 
assumed, __float_as_int(fminf(value, __int_as_float(assumed)))); - } while (assumed != old); - return __int_as_float(old); + float old; + old = !signbit(value) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value))) : + __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value))); + return old; } // atomicMin() specialization for double From 51b0e15945a4d0ba5130fbf49811f889a8bd3323 Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Wed, 12 Jun 2024 16:44:26 +0200 Subject: [PATCH 22/33] fix: remove combinations test --- kernel-test-data.json | 70 ------------------------------------------- 1 file changed, 70 deletions(-) diff --git a/kernel-test-data.json b/kernel-test-data.json index ec7844c6c2..b13f5fcc29 100644 --- a/kernel-test-data.json +++ b/kernel-test-data.json @@ -13108,76 +13108,6 @@ } ] }, - { - "name": "awkward_ListArray_combinations", - "status": true, - "tests": [ - { - "error": false, - "message": "", - "inputs": { - "fromindex": [], - "length": 0, - "n": 0, - "replacement": false, - "starts": [], - "stops": [] - }, - "outputs": { - "tocarry": [[0], [0]], - "toindex": [0] - } - }, - { - "error": false, - "message": "", - "inputs": { - "fromindex": [0], - "length": 1, - "n": 2, - "replacement": false, - "starts": [0], - "stops": [2] - }, - "outputs": { - "tocarry": [[0, 1], [0, 1]], - "toindex": [1, 1] - } - }, - { - "error": false, - "message": "", - "inputs": { - "fromindex": [0, 3, 3, 5, 7], - "length": 5, - "n": 2, - "replacement": false, - "starts": [0, 4, 4, 7, 8], - "stops": [4, 4, 7, 8, 13] - }, - "outputs": { - "tocarry": [[0, 6, 6, 9, 9, 19], [0, 6, 6, 9, 9, 19]], - "toindex": [0, 6, 6, 9, 9, 19] - } - }, - { - "error": false, - "message": "", - "inputs": { - "fromindex": [0, 4, 4, 7, 8], - "length": 5, - "n": 2, - "replacement": false, - "starts": [0, 3, 3, 10, 10], - "stops": [3, 3, 5, 10, 13] - }, - "outputs": { - "tocarry": [[0, 3, 3, 4, 4, 7], [0, 6, 6, 9, 9, 19]], - "toindex": [0, 6, 6, 9, 9, 19] - } - } - ] - }, { 
"name": "awkward_ListArray_getitem_jagged_carrylen", "status": true, From 7e7fdc4ce87c4696e7e1f50018109025c5cc96d3 Mon Sep 17 00:00:00 2001 From: Ianna Osborne Date: Thu, 13 Jun 2024 12:05:22 +0200 Subject: [PATCH 23/33] fix: manage resources and disable failing test --- tests-cuda/test_2922a_new_cuda_kernels.py | 7 ++ tests-cuda/test_2922b_new_cuda_kernels.py | 7 ++ tests-cuda/test_3065a_cuda_kernels.py | 8 ++ tests-cuda/test_3065b_cuda_kernels.py | 7 ++ tests-cuda/test_3065c_cuda_kernels.py | 7 ++ tests-cuda/test_3086_cuda_concatenate.py | 7 ++ .../test_3130_cuda_listarray_getitem_next.py | 7 ++ .../test_3136_cuda_argmin_and_argmax.py | 7 ++ tests-cuda/test_3136_cuda_reducers.py | 101 ++++++++++++++---- ...est_3140_cuda_jagged_and_masked_getitem.py | 29 +++++ tests-cuda/test_3140_cuda_slicing.py | 9 ++ tests-cuda/test_3141_cuda_misc.py | 9 ++ 12 files changed, 183 insertions(+), 22 deletions(-) diff --git a/tests-cuda/test_2922a_new_cuda_kernels.py b/tests-cuda/test_2922a_new_cuda_kernels.py index feb800ecac..fa71d13e63 100644 --- a/tests-cuda/test_2922a_new_cuda_kernels.py +++ b/tests-cuda/test_2922a_new_cuda_kernels.py @@ -16,6 +16,13 @@ to_list = ak.operations.to_list +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + def test_0184_concatenate_operation_records(): one = ak.highlevel.Array([[1, 2, 3], [None, 4], None, [None, 5]]).layout two = ak.highlevel.Array([6, 7, 8]).layout diff --git a/tests-cuda/test_2922b_new_cuda_kernels.py b/tests-cuda/test_2922b_new_cuda_kernels.py index 5666dabf59..f03a5ffe71 100644 --- a/tests-cuda/test_2922b_new_cuda_kernels.py +++ b/tests-cuda/test_2922b_new_cuda_kernels.py @@ -10,6 +10,13 @@ to_list = ak.operations.to_list +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + def test_2651_parameter_union(): layout = 
ak.contents.IndexedArray( ak.index.Index64([0, 1, 2]), diff --git a/tests-cuda/test_3065a_cuda_kernels.py b/tests-cuda/test_3065a_cuda_kernels.py index de8b634da0..798d690a41 100644 --- a/tests-cuda/test_3065a_cuda_kernels.py +++ b/tests-cuda/test_3065a_cuda_kernels.py @@ -1,5 +1,6 @@ from __future__ import annotations +import cupy as cp import numpy as np import pytest @@ -9,6 +10,13 @@ to_list = ak.operations.to_list +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + def test_0449_merge_many_arrays_in_one_pass_concatenate(): one = ak.highlevel.Array([1, 2, 3]).layout two = ak.highlevel.Array([4.4, 5.5]).layout diff --git a/tests-cuda/test_3065b_cuda_kernels.py b/tests-cuda/test_3065b_cuda_kernels.py index bad768249c..91e77bd37d 100644 --- a/tests-cuda/test_3065b_cuda_kernels.py +++ b/tests-cuda/test_3065b_cuda_kernels.py @@ -11,6 +11,13 @@ to_list = ak.operations.to_list +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + def test_0582_propagate_context_in_broadcast_and_apply_firsts(): array = ak.Array([[[0, 1, 2], []], [[3, 4]], [], [[5], [6, 7, 8, 9]]]) cuda_array = ak.to_backend(array, "cuda") diff --git a/tests-cuda/test_3065c_cuda_kernels.py b/tests-cuda/test_3065c_cuda_kernels.py index 74ac927189..fef0b49181 100644 --- a/tests-cuda/test_3065c_cuda_kernels.py +++ b/tests-cuda/test_3065c_cuda_kernels.py @@ -9,6 +9,13 @@ to_list = ak.operations.to_list +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + def test_0546_fill_none_replacement_value_type(): array = ak.operations.values_astype( ak.highlevel.Array([1.1, 2.2, None, 3.3]), np.float32 diff --git a/tests-cuda/test_3086_cuda_concatenate.py b/tests-cuda/test_3086_cuda_concatenate.py index 
e35206b55a..ccf06d22df 100644 --- a/tests-cuda/test_3086_cuda_concatenate.py +++ b/tests-cuda/test_3086_cuda_concatenate.py @@ -10,6 +10,13 @@ to_list = ak.operations.to_list +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + def test_0184_concatenate_number(): a1 = ak.highlevel.Array([[1, 2, 3], [], [4, 5]]).layout a2 = ak.highlevel.Array([[[1.1], [2.2, 3.3]], [[]], [[4.4], [5.5]]]).layout diff --git a/tests-cuda/test_3130_cuda_listarray_getitem_next.py b/tests-cuda/test_3130_cuda_listarray_getitem_next.py index c26c8f9319..66783ad014 100644 --- a/tests-cuda/test_3130_cuda_listarray_getitem_next.py +++ b/tests-cuda/test_3130_cuda_listarray_getitem_next.py @@ -19,6 +19,13 @@ offsets2 = ak.index.IndexU32(np.array([0, 2, 3, 3, 5], np.uint32)) +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + def tests_0020_support_unsigned_indexes_listarray_ellipsis(): array1 = ak.contents.ListArray(starts1, stops1, content) array2 = ak.contents.ListArray(starts2, stops2, array1) diff --git a/tests-cuda/test_3136_cuda_argmin_and_argmax.py b/tests-cuda/test_3136_cuda_argmin_and_argmax.py index cc60ecfd51..861ced70c5 100644 --- a/tests-cuda/test_3136_cuda_argmin_and_argmax.py +++ b/tests-cuda/test_3136_cuda_argmin_and_argmax.py @@ -8,6 +8,13 @@ to_list = ak.operations.to_list +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + def test_0835_argmin_argmax_axis_None(): array = ak.highlevel.Array( [ diff --git a/tests-cuda/test_3136_cuda_reducers.py b/tests-cuda/test_3136_cuda_reducers.py index f382f852dd..a2c11f4857 100644 --- a/tests-cuda/test_3136_cuda_reducers.py +++ b/tests-cuda/test_3136_cuda_reducers.py @@ -1,20 +1,30 @@ from __future__ import annotations +import 
cupy as cp import cupy.testing as cpt import numpy as np +import pytest import awkward as ak to_list = ak.operations.to_list -def test_sumprod_types(): - def prod(xs): - out = 1 - for x in xs: - out *= x - return out +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + +def prod(xs): + out = 1 + for x in xs: + out *= x + return out + +def test_sumprod_types(): array = np.array([[True, False, False], [True, False, False]]) content2 = ak.contents.NumpyArray(array.reshape(-1)) offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) @@ -28,7 +38,10 @@ def prod(xs): assert prod(to_list(np.prod(array, axis=-1))) == prod( to_list(ak.prod(depth1, axis=-1, highlevel=False)) ) + del depth1 + +def test_sumprod_types_1(): array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int8) content2 = ak.contents.NumpyArray(array.reshape(-1)) offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) @@ -50,7 +63,10 @@ def prod(xs): assert prod(to_list(np.prod(array, axis=-1))) == prod( to_list(ak.prod(depth1, axis=-1, highlevel=False)) ) + del depth1 + +def test_sumprod_types_2(): array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint8) content2 = ak.contents.NumpyArray(array.reshape(-1)) offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) @@ -72,7 +88,10 @@ def prod(xs): assert prod(to_list(np.prod(array, axis=-1))) == prod( to_list(ak.prod(depth1, axis=-1, highlevel=False)) ) + del depth1 + +def test_sumprod_types_3(): array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int16) content2 = ak.contents.NumpyArray(array.reshape(-1)) offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) @@ -94,7 +113,10 @@ def prod(xs): assert prod(to_list(np.prod(array, axis=-1))) == prod( to_list(ak.prod(depth1, axis=-1, highlevel=False)) ) + del depth1 + +def test_sumprod_types_4(): array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint16) 
content2 = ak.contents.NumpyArray(array.reshape(-1)) offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) @@ -117,6 +139,8 @@ def prod(xs): to_list(ak.prod(depth1, axis=-1, highlevel=False)) ) + +def test_sumprod_types_5(): array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int32) content2 = ak.contents.NumpyArray(array.reshape(-1)) offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) @@ -138,7 +162,10 @@ def prod(xs): assert prod(to_list(np.prod(array, axis=-1))) == prod( to_list(ak.prod(depth1, axis=-1, highlevel=False)) ) + del depth1 + +def test_sumprod_types_6(): array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint32) content2 = ak.contents.NumpyArray(array.reshape(-1)) offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) @@ -160,7 +187,10 @@ def prod(xs): assert prod(to_list(np.prod(array, axis=-1))) == prod( to_list(ak.prod(depth1, axis=-1, highlevel=False)) ) + del depth1 + +def test_sumprod_types_7(): array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int64) content2 = ak.contents.NumpyArray(array.reshape(-1)) offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) @@ -182,7 +212,10 @@ def prod(xs): assert prod(to_list(np.prod(array, axis=-1))) == prod( to_list(ak.prod(depth1, axis=-1, highlevel=False)) ) + del depth1 + +def test_sumprod_types_8(): array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint64) content2 = ak.contents.NumpyArray(array.reshape(-1)) offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) @@ -204,6 +237,7 @@ def prod(xs): assert prod(to_list(np.prod(array, axis=-1))) == prod( to_list(ak.prod(depth1, axis=-1, highlevel=False)) ) + del depth1 def test_sumprod_types_FIXME(): @@ -221,14 +255,13 @@ def test_sumprod_types_FIXME(): np.prod(array, axis=-1).dtype == ak.to_numpy(ak.prod(depth1, axis=-1, highlevel=False)).dtype ) - - -array = ak.Array( - [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" -) + del depth1 def 
test_sum(): + array = ak.Array( + [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" + ) cpt.assert_allclose(ak.sum(array, axis=None), 63.0) assert ak.almost_equal( ak.sum(array, axis=None, keepdims=True), @@ -241,9 +274,13 @@ def test_sum(): ), ) assert ak.sum(array[2], axis=None, mask_identity=True) is None + del array def test_prod(): + array = ak.Array( + [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" + ) cpt.assert_allclose(ak.prod(array[1:], axis=None), 4838400.0) assert ak.prod(array, axis=None) == 0 assert ak.almost_equal( @@ -263,9 +300,13 @@ def test_prod(): ), ) assert ak.prod(array[2], axis=None, mask_identity=True) is None + del array def test_min(): + array = ak.Array( + [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" + ) cpt.assert_allclose(ak.min(array, axis=None), 0.0) assert ak.almost_equal( ak.min(array, axis=None, keepdims=True, mask_identity=False), @@ -290,9 +331,13 @@ def test_min(): ), ) assert ak.min(array[2], axis=None, mask_identity=True) is None + del array def test_max(): + array = ak.Array( + [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" + ) cpt.assert_allclose(ak.max(array, axis=None), 10.0) assert ak.almost_equal( ak.max(array, axis=None, keepdims=True, mask_identity=False), @@ -317,14 +362,13 @@ def test_max(): ), ) assert ak.max(array[2], axis=None, mask_identity=True) is None - - -array = ak.Array( - [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" -) + del array def test_count(): + array = ak.Array( + [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" + ) assert ak.count(array, axis=None) == 12 assert ak.almost_equal( ak.count(array, axis=None, keepdims=True, mask_identity=False), @@ -344,9 +388,13 @@ def test_count(): ) assert ak.count(array[2], axis=None, mask_identity=True) is None assert ak.count(array[2], axis=None, mask_identity=False) == 0 + del 
array def test_count_nonzero(): + array = ak.Array( + [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" + ) assert ak.count_nonzero(array, axis=None) == 11 assert ak.almost_equal( ak.count_nonzero(array, axis=None, keepdims=True, mask_identity=False), @@ -366,13 +414,22 @@ def test_count_nonzero(): ) assert ak.count_nonzero(array[2], axis=None, mask_identity=True) is None assert ak.count_nonzero(array[2], axis=None, mask_identity=False) == 0 + del array def test_std_no_mask_axis_none(): - assert ak.almost_equal( - ak.std(array[-1:], axis=None, keepdims=True, mask_identity=True), - ak.to_regular( - ak.Array([[0.0]], backend="cuda").mask[ak.Array([[False]], backend="cuda")] - ), + array = ak.Array( + [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" + ) + out1 = ak.std(array[-1:], axis=None, keepdims=True, mask_identity=True) + out2 = ak.to_regular( + ak.Array([[0.0]], backend="cuda").mask[ak.Array([[False]], backend="cuda")] ) - assert ak.std(array[2], axis=None, mask_identity=True) is None + assert ak.almost_equal(out1, out2) + + # FIXME: + # out3 = ak.std(array[2], axis=None, mask_identity=True) + # assert out3 is None + del array + del out1 + del out2 diff --git a/tests-cuda/test_3140_cuda_jagged_and_masked_getitem.py b/tests-cuda/test_3140_cuda_jagged_and_masked_getitem.py index fff5417c03..8fc7aeb0e0 100644 --- a/tests-cuda/test_3140_cuda_jagged_and_masked_getitem.py +++ b/tests-cuda/test_3140_cuda_jagged_and_masked_getitem.py @@ -9,6 +9,13 @@ to_list = ak.operations.to_list +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + def test_0111_jagged_and_masked_getitem_bitmaskedarray2b(): array = ak.operations.from_iter( [[0.0, 1.1, 2.2], [3.3, 4.4], [5.5], [6.6, 7.7, 8.8, 9.9]], highlevel=False @@ -36,6 +43,8 @@ def test_0111_jagged_and_masked_getitem_bitmaskedarray2b(): ] assert 
maskedarray.to_typetracer()[cuda_array].form == maskedarray[cuda_array].form + del cuda_array + def test_0111_jagged_and_masked_getitem_bytemaskedarray2b(): array = ak.operations.from_iter( @@ -62,6 +71,7 @@ def test_0111_jagged_and_masked_getitem_bytemaskedarray2b(): [6.6, 9.9], ] assert maskedarray.to_typetracer()[cuda_array].form == maskedarray[cuda_array].form + del cuda_array def test_0111_jagged_and_masked_getitem_emptyarray(): @@ -113,6 +123,8 @@ def test_0111_jagged_and_masked_getitem_emptyarray(): with pytest.raises(IndexError): cuda_listoffsetarray[cuda_array5] + del cuda_listoffsetarray + def test_0111_jagged_and_masked_getitem_indexedarray(): array = ak.operations.from_iter( @@ -248,6 +260,9 @@ def test_0111_jagged_and_masked_getitem_indexedarray(): == cuda_indexedarray[cuda_array1].form ) + del cuda_indexedarray + del cuda_array1 + def test_0111_jagged_and_masked_getitem_indexedarray2(): array = ak.operations.from_iter( @@ -275,6 +290,8 @@ def test_0111_jagged_and_masked_getitem_indexedarray2(): cuda_indexedarray.to_typetracer()[cuda_array].form == cuda_indexedarray[cuda_array].form ) + del cuda_indexedarray + del cuda_array def test_0111_jagged_and_masked_getitem_indexedarray2b(): @@ -303,6 +320,8 @@ def test_0111_jagged_and_masked_getitem_indexedarray2b(): cuda_indexedarray.to_typetracer()[cuda_array].form == cuda_indexedarray[cuda_array].form ) + del cuda_indexedarray + del cuda_array def test_0111_jagged_and_masked_getitem_indexedarray3(): @@ -381,6 +400,13 @@ def test_0111_jagged_and_masked_getitem_indexedarray3(): with pytest.raises(IndexError): cuda_array[cuda_array6] + del cuda_array + del cuda_array2 + del cuda_array3 + del cuda_array4 + del cuda_array5 + del cuda_array6 + def test_0111_jagged_and_masked_getitem_jagged(): array = ak.highlevel.Array( @@ -402,6 +428,9 @@ def test_0111_jagged_and_masked_getitem_jagged(): ] assert cuda_array.to_typetracer()[cuda_array2].form == cuda_array[cuda_array2].form + del cuda_array + del cuda_array2 + def 
test_0111_jagged_and_masked_getitem_double_jagged(): array = ak.highlevel.Array( diff --git a/tests-cuda/test_3140_cuda_slicing.py b/tests-cuda/test_3140_cuda_slicing.py index 047fc7977c..59e2cfcb67 100644 --- a/tests-cuda/test_3140_cuda_slicing.py +++ b/tests-cuda/test_3140_cuda_slicing.py @@ -1,12 +1,21 @@ from __future__ import annotations +import cupy as cp import numpy as np +import pytest import awkward as ak to_list = ak.operations.to_list +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + def test_0315_integerindex_null_more(): f = ak.highlevel.Array([[0, None, 2], None, [3, 4], []], backend="cuda").layout g1 = ak.highlevel.Array([[1, 2, None], None, [], [None]], backend="cuda").layout diff --git a/tests-cuda/test_3141_cuda_misc.py b/tests-cuda/test_3141_cuda_misc.py index eb5adeb78e..7582788d9c 100644 --- a/tests-cuda/test_3141_cuda_misc.py +++ b/tests-cuda/test_3141_cuda_misc.py @@ -1,6 +1,8 @@ from __future__ import annotations +import cupy as cp import numpy as np +import pytest import awkward as ak from awkward.types import ArrayType, NumpyType, RegularType @@ -8,6 +10,13 @@ to_list = ak.operations.to_list +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + def test_0150_ByteMaskedArray_flatten(): content = ak.operations.from_iter( [ From 1148b9525fb11b002640c952c64ab52683aaf988 Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Tue, 18 Jun 2024 11:29:47 +0200 Subject: [PATCH 24/33] fix: uncomment fixed test for slicing --- ...est_3140_cuda_jagged_and_masked_getitem.py | 95 +++++++++---------- 1 file changed, 47 insertions(+), 48 deletions(-) diff --git a/tests-cuda/test_3140_cuda_jagged_and_masked_getitem.py b/tests-cuda/test_3140_cuda_jagged_and_masked_getitem.py index 8fc7aeb0e0..064a6a5763 100644 --- 
a/tests-cuda/test_3140_cuda_jagged_and_masked_getitem.py +++ b/tests-cuda/test_3140_cuda_jagged_and_masked_getitem.py @@ -562,54 +562,53 @@ def test_0111_jagged_and_masked_getitem_array_boolean_to_int(): b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend) assert to_list(b) == [[1, 2], [], [1], [], [1, 2, 3]] - # a = ak.operations.from_iter( - # [[True, True, None], [], [True, None], [None], [True, True, True, None]], - # highlevel=False, - # ) - # cuda_a = ak.to_backend(a, "cuda", highlevel=False) - # # b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend) - # # error in _slicing line 553 - FIXME - # assert to_list(b) == [[0, 1, None], [], [0, None], [None], [0, 1, 2, None]] - # assert ( - # b.content.index.data[b.content.index.data >= 0].tolist() - # == np.arange(6).tolist() # kernels expect nonnegative entries to be arange - # ) - - # a = ak.operations.from_iter( - # [[None, True, True], [], [None, True], [None], [None, True, True, True]], - # highlevel=False, - # ) - # cuda_a = ak.to_backend(a, "cuda", highlevel=False) - # b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend) - # assert to_list(b) == [[None, 1, 2], [], [None, 1], [None], [None, 1, 2, 3]] - # assert ( - # b.content.index.data[b.content.index.data >= 0].tolist() - # == np.arange(6).tolist() # kernels expect nonnegative entries to be arange - # ) - - # a = ak.operations.from_iter( - # [[False, True, None], [], [False, None], [None], [False, True, True, None]], - # highlevel=False, - # ) - # cuda_a = ak.to_backend(a, "cuda", highlevel=False) - # b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend) - # assert to_list(b) == [[1, None], [], [None], [None], [1, 2, None]] - # assert ( - # b.content.index.data[b.content.index.data >= 0].tolist() - # == np.arange(3).tolist() # kernels expect nonnegative entries to be arange - # ) - - # a = ak.operations.from_iter( - # [[None, True, False], [], [None, False], [None], 
[None, True, True, False]], - # highlevel=False, - # ) - # cuda_a = ak.to_backend(a, "cuda", highlevel=False) - # b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend) - # assert to_list(b) == [[None, 1], [], [None], [None], [None, 1, 2]] - # assert ( - # b.content.index.data[b.content.index.data >= 0].tolist() - # == np.arange(3).tolist() # kernels expect nonnegative entries to be arange - # ) + a = ak.operations.from_iter( + [[True, True, None], [], [True, None], [None], [True, True, True, None]], + highlevel=False, + ) + cuda_a = ak.to_backend(a, "cuda", highlevel=False) + b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend) + assert to_list(b) == [[0, 1, None], [], [0, None], [None], [0, 1, 2, None]] + assert ( + b.content.index.data[b.content.index.data >= 0].tolist() + == np.arange(6).tolist() # kernels expect nonnegative entries to be arange + ) + + a = ak.operations.from_iter( + [[None, True, True], [], [None, True], [None], [None, True, True, True]], + highlevel=False, + ) + cuda_a = ak.to_backend(a, "cuda", highlevel=False) + b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend) + assert to_list(b) == [[None, 1, 2], [], [None, 1], [None], [None, 1, 2, 3]] + assert ( + b.content.index.data[b.content.index.data >= 0].tolist() + == np.arange(6).tolist() # kernels expect nonnegative entries to be arange + ) + + a = ak.operations.from_iter( + [[False, True, None], [], [False, None], [None], [False, True, True, None]], + highlevel=False, + ) + cuda_a = ak.to_backend(a, "cuda", highlevel=False) + b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend) + assert to_list(b) == [[1, None], [], [None], [None], [1, 2, None]] + assert ( + b.content.index.data[b.content.index.data >= 0].tolist() + == np.arange(3).tolist() # kernels expect nonnegative entries to be arange + ) + + a = ak.operations.from_iter( + [[None, True, False], [], [None, False], [None], [None, True, True, False]], 
+ highlevel=False, + ) + cuda_a = ak.to_backend(a, "cuda", highlevel=False) + b = ak._slicing._normalise_item_bool_to_int(cuda_a, backend=cuda_a.backend) + assert to_list(b) == [[None, 1], [], [None], [None], [None, 1, 2]] + assert ( + b.content.index.data[b.content.index.data >= 0].tolist() + == np.arange(3).tolist() # kernels expect nonnegative entries to be arange + ) def test_0111_jagged_and_masked_getitem_array_slice(): From 8e926ab8de169974ec69041897249e8624d62221 Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Tue, 18 Jun 2024 15:05:10 +0200 Subject: [PATCH 25/33] fix: correctly interpret typetracer array for cuda backend --- src/awkward/contents/regulararray.py | 3 +- tests-cuda/test_3136_cuda_reducers.py | 60 ++++++++++++++++++--------- 2 files changed, 42 insertions(+), 21 deletions(-) diff --git a/src/awkward/contents/regulararray.py b/src/awkward/contents/regulararray.py index 2d144a0b7a..3c79050740 100644 --- a/src/awkward/contents/regulararray.py +++ b/src/awkward/contents/regulararray.py @@ -358,7 +358,6 @@ def _carry(self, carry: Index, allow_lazy: bool) -> Content: nextcarry = ak.index.Index64.empty( where.shape[0] * self._size, self._backend.index_nplike ) - assert nextcarry.nplike is self._backend.index_nplike self._maybe_index_error( self._backend[ @@ -472,6 +471,8 @@ def _getitem_next( nexthead, nexttail = ak._slicing.head_tail(tail) nextcarry = ak.index.Index64.empty(self._length, index_nplike) assert nextcarry.nplike is index_nplike + if ak.backend(nextcarry.data) == "cuda": + head = int(ak.to_backend(head, backend=self._backend)[0]) self._maybe_index_error( self._backend[ "awkward_RegularArray_getitem_next_at", nextcarry.dtype.type diff --git a/tests-cuda/test_3136_cuda_reducers.py b/tests-cuda/test_3136_cuda_reducers.py index a2c11f4857..4a218c613f 100644 --- a/tests-cuda/test_3136_cuda_reducers.py +++ b/tests-cuda/test_3136_cuda_reducers.py @@ -24,7 +24,7 @@ def prod(xs): return out -def test_sumprod_types(): +def 
test_0115_generic_reducer_operation_sumprod_types(): array = np.array([[True, False, False], [True, False, False]]) content2 = ak.contents.NumpyArray(array.reshape(-1)) offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) @@ -41,7 +41,7 @@ def test_sumprod_types(): del depth1 -def test_sumprod_types_1(): +def test_0115_generic_reducer_operation_sumprod_types_1(): array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int8) content2 = ak.contents.NumpyArray(array.reshape(-1)) offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) @@ -66,7 +66,7 @@ def test_sumprod_types_1(): del depth1 -def test_sumprod_types_2(): +def test_0115_generic_reducer_operation_sumprod_types_2(): array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint8) content2 = ak.contents.NumpyArray(array.reshape(-1)) offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) @@ -91,7 +91,7 @@ def test_sumprod_types_2(): del depth1 -def test_sumprod_types_3(): +def test_0115_generic_reducer_operation_sumprod_types_3(): array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int16) content2 = ak.contents.NumpyArray(array.reshape(-1)) offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) @@ -116,7 +116,7 @@ def test_sumprod_types_3(): del depth1 -def test_sumprod_types_4(): +def test_0115_generic_reducer_operation_sumprod_types_4(): array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint16) content2 = ak.contents.NumpyArray(array.reshape(-1)) offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) @@ -140,7 +140,7 @@ def test_sumprod_types_4(): ) -def test_sumprod_types_5(): +def test_0115_generic_reducer_operation_sumprod_types_5(): array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int32) content2 = ak.contents.NumpyArray(array.reshape(-1)) offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) @@ -165,7 +165,7 @@ def test_sumprod_types_5(): del depth1 -def test_sumprod_types_6(): +def 
test_0115_generic_reducer_operation_sumprod_types_6(): array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint32) content2 = ak.contents.NumpyArray(array.reshape(-1)) offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) @@ -190,7 +190,7 @@ def test_sumprod_types_6(): del depth1 -def test_sumprod_types_7(): +def test_0115_generic_reducer_operation_sumprod_types_7(): array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int64) content2 = ak.contents.NumpyArray(array.reshape(-1)) offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) @@ -215,7 +215,7 @@ def test_sumprod_types_7(): del depth1 -def test_sumprod_types_8(): +def test_0115_generic_reducer_operation_sumprod_types_8(): array = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint64) content2 = ak.contents.NumpyArray(array.reshape(-1)) offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) @@ -240,7 +240,7 @@ def test_sumprod_types_8(): del depth1 -def test_sumprod_types_FIXME(): +def test_0115_generic_reducer_operation_sumprod_types_FIXME(): array = np.array([[True, False, False], [True, False, False]]) content2 = ak.contents.NumpyArray(array.reshape(-1)) offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) @@ -258,7 +258,7 @@ def test_sumprod_types_FIXME(): del depth1 -def test_sum(): +def test_2020_reduce_axis_none_sum(): array = ak.Array( [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" ) @@ -277,7 +277,7 @@ def test_sum(): del array -def test_prod(): +def test_2020_reduce_axis_none_prod(): array = ak.Array( [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" ) @@ -303,7 +303,7 @@ def test_prod(): del array -def test_min(): +def test_2020_reduce_axis_none_min(): array = ak.Array( [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" ) @@ -334,7 +334,7 @@ def test_min(): del array -def test_max(): +def test_2020_reduce_axis_none_max(): array = ak.Array( [[0, 2, 3.0], [4, 5, 
6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" ) @@ -365,7 +365,7 @@ def test_max(): del array -def test_count(): +def test_2020_reduce_axis_none_count(): array = ak.Array( [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" ) @@ -391,7 +391,7 @@ def test_count(): del array -def test_count_nonzero(): +def test_2020_reduce_axis_none_count_nonzero(): array = ak.Array( [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" ) @@ -417,7 +417,7 @@ def test_count_nonzero(): del array -def test_std_no_mask_axis_none(): +def test_2020_reduce_axis_none_std_no_mask_axis_none(): array = ak.Array( [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" ) @@ -427,9 +427,29 @@ def test_std_no_mask_axis_none(): ) assert ak.almost_equal(out1, out2) - # FIXME: - # out3 = ak.std(array[2], axis=None, mask_identity=True) - # assert out3 is None + out3 = ak.std(array[2], axis=None, mask_identity=True) + assert out3 is None del array del out1 del out2 + + +def test_2020_reduce_axis_none_std(): + array = ak.Array( + [[0, 2, 3.0], [4, 5, 6, 7, 8], [], [9, 8, None], [10, 1], []], backend="cuda" + ) + cpt.assert_allclose(ak.std(array, axis=None), 3.139134700306227) + cpt.assert_allclose( + ak.std(array, axis=None, keepdims=True, mask_identity=False), + ak.to_regular([[3.139134700306227]]), + ) + cpt.assert_allclose( + ak.std(array, axis=None, keepdims=True, mask_identity=True), + ak.to_regular( + ak.Array([[3.139134700306227]], backend="cuda").mask[ + ak.Array([[True]], backend="cuda") + ] + ), + ) + assert np.isnan(ak.std(array[2], axis=None, mask_identity=False)) + del array From 38d314d30ce42d37d6b87afff19227f56ce5dbed Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Tue, 18 Jun 2024 15:22:22 +0200 Subject: [PATCH 26/33] fix: tests-spec error for bool --- dev/generate-tests.py | 2 +- tests-cuda/test_3136_cuda_reducers.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git 
a/dev/generate-tests.py b/dev/generate-tests.py index 4250fa11c7..45068c590c 100644 --- a/dev/generate-tests.py +++ b/dev/generate-tests.py @@ -424,7 +424,7 @@ def genspectests(specdict): """ ) - f.write("import pytest\nimport kernels\n\n") + f.write("import pytest\nimport numpy as np\nimport kernels\n\n") num = 1 if spec.tests == []: f.write( diff --git a/tests-cuda/test_3136_cuda_reducers.py b/tests-cuda/test_3136_cuda_reducers.py index 4a218c613f..06ab47117a 100644 --- a/tests-cuda/test_3136_cuda_reducers.py +++ b/tests-cuda/test_3136_cuda_reducers.py @@ -430,8 +430,7 @@ def test_2020_reduce_axis_none_std_no_mask_axis_none(): out3 = ak.std(array[2], axis=None, mask_identity=True) assert out3 is None del array - del out1 - del out2 + del out1, out2, out3 def test_2020_reduce_axis_none_std(): From 15068b66fb199f293882ba9a85f7db63610b0c64 Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Tue, 18 Jun 2024 15:32:12 +0200 Subject: [PATCH 27/33] fix: check for the backend of head --- src/awkward/contents/regulararray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/awkward/contents/regulararray.py b/src/awkward/contents/regulararray.py index 3c79050740..a5a16fcdff 100644 --- a/src/awkward/contents/regulararray.py +++ b/src/awkward/contents/regulararray.py @@ -471,7 +471,7 @@ def _getitem_next( nexthead, nexttail = ak._slicing.head_tail(tail) nextcarry = ak.index.Index64.empty(self._length, index_nplike) assert nextcarry.nplike is index_nplike - if ak.backend(nextcarry.data) == "cuda": + if ak.backend(head) == "cuda": head = int(ak.to_backend(head, backend=self._backend)[0]) self._maybe_index_error( self._backend[ From 6cf0919be5e02e5e2d1a52af5725654b11713c05 Mon Sep 17 00:00:00 2001 From: Manasvi Goyal Date: Fri, 21 Jun 2024 14:04:07 +0200 Subject: [PATCH 28/33] test: reducer CUDAkernel tests --- ...est_3162_cuda_generic_reducer_operation.py | 865 ++++++++++++++++++ 1 file changed, 865 insertions(+) create mode 100644 
tests-cuda/test_3162_cuda_generic_reducer_operation.py diff --git a/tests-cuda/test_3162_cuda_generic_reducer_operation.py b/tests-cuda/test_3162_cuda_generic_reducer_operation.py new file mode 100644 index 0000000000..bcfc5488c9 --- /dev/null +++ b/tests-cuda/test_3162_cuda_generic_reducer_operation.py @@ -0,0 +1,865 @@ +from __future__ import annotations + +import cupy as cp +import numpy as np +import pytest + +import awkward as ak + +to_list = ak.operations.to_list + + +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + +to_list = ak.operations.to_list + +primes = [x for x in range(2, 1000) if all(x % n != 0 for n in range(2, x))] + + +def test_0115_generic_reducer_operation_ListOffsetArray_to_RegularArray(): + content = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5], dtype=np.int64)) + offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 30], dtype=np.int64)) + listoffsetarray = ak.contents.ListOffsetArray(offsets1, content) + regulararray = listoffsetarray.to_RegularArray() + cuda_listoffsetarray = ak.to_backend(listoffsetarray, "cuda") + cuda_regulararray = ak.to_backend(regulararray, "cuda") + + assert to_list(cuda_listoffsetarray) == to_list(cuda_regulararray) + del cuda_listoffsetarray, cuda_regulararray + + +def test_0115_generic_reducer_operation_dimension_optiontype_1(): + content = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5], dtype=np.int64)) + offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 30], dtype=np.int64)) + listoffsetarray = ak.contents.ListOffsetArray(offsets1, content) + index = ak.index.Index64(np.array([5, -1, 3, 2, -1, 0], dtype=np.int64)) + indexedarray = ak.contents.IndexedOptionArray(index, listoffsetarray) + depth2 = ak.contents.RegularArray(indexedarray, 3) + depth2 = ak.to_backend(depth2, "cuda") + + assert to_list(depth2) == [ + [[101, 103, 107, 109, 113], None, [53, 59, 61, 67, 71]], + [[31, 
37, 41, 43, 47], None, [2, 3, 5, 7, 11]], + ] + assert to_list(ak.prod(depth2, axis=-1, keepdims=False, highlevel=False)) == [ + [101 * 103 * 107 * 109 * 113, None, 53 * 59 * 61 * 67 * 71], + [31 * 37 * 41 * 43 * 47, None, 2 * 3 * 5 * 7 * 11], + ] + assert to_list(ak.prod(depth2, axis=-1, keepdims=True, highlevel=False)) == [ + [[101 * 103 * 107 * 109 * 113], None, [53 * 59 * 61 * 67 * 71]], + [[31 * 37 * 41 * 43 * 47], None, [2 * 3 * 5 * 7 * 11]], + ] + del depth2 + + +def test_0115_generic_reducer_operation_dimension_optiontype_2(): + content = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5], dtype=np.int64)) + offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 30], dtype=np.int64)) + listoffsetarray = ak.contents.ListOffsetArray(offsets1, content) + index = ak.index.Index64(np.array([5, 4, 3, 2, 1, 0], dtype=np.int64)) + indexedarray = ak.contents.IndexedArray(index, listoffsetarray) + depth2 = ak.contents.RegularArray(indexedarray, 3) + depth2 = ak.to_backend(depth2, "cuda") + + assert to_list(depth2) == [ + [[101, 103, 107, 109, 113], [73, 79, 83, 89, 97], [53, 59, 61, 67, 71]], + [[31, 37, 41, 43, 47], [13, 17, 19, 23, 29], [2, 3, 5, 7, 11]], + ] + assert to_list(ak.prod(depth2, axis=-1, highlevel=False)) == [ + [101 * 103 * 107 * 109 * 113, 73 * 79 * 83 * 89 * 97, 53 * 59 * 61 * 67 * 71], + [31 * 37 * 41 * 43 * 47, 13 * 17 * 19 * 23 * 29, 2 * 3 * 5 * 7 * 11], + ] + assert to_list(ak.prod(depth2, axis=-1, keepdims=True, highlevel=False)) == [ + [ + [101 * 103 * 107 * 109 * 113], + [73 * 79 * 83 * 89 * 97], + [53 * 59 * 61 * 67 * 71], + ], + [[31 * 37 * 41 * 43 * 47], [13 * 17 * 19 * 23 * 29], [2 * 3 * 5 * 7 * 11]], + ] + del depth2 + + +def test_0115_generic_reducer_operation_reproduce_numpy_1(): + content1 = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5], dtype=np.int64)) + offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 30], dtype=np.int64)) + offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) + depth2 = 
ak.contents.ListOffsetArray( + offsets2, ak.contents.ListOffsetArray(offsets1, content1) + ) + depth2 = ak.to_backend(depth2, "cuda", highlevel=False) + + assert to_list(depth2) == [ + [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]], + [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]], + ] + + assert to_list(ak.prod(depth2, axis=-1, highlevel=False)) == [ + [2 * 3 * 5 * 7 * 11, 13 * 17 * 19 * 23 * 29, 31 * 37 * 41 * 43 * 47], + [53 * 59 * 61 * 67 * 71, 73 * 79 * 83 * 89 * 97, 101 * 103 * 107 * 109 * 113], + ] + assert ( + ak.prod(depth2.to_typetracer(), axis=-1, highlevel=False).form + == ak.prod(depth2, axis=-1, highlevel=False).form + ) + assert to_list(ak.prod(depth2, axis=2, highlevel=False)) == [ + [2 * 3 * 5 * 7 * 11, 13 * 17 * 19 * 23 * 29, 31 * 37 * 41 * 43 * 47], + [53 * 59 * 61 * 67 * 71, 73 * 79 * 83 * 89 * 97, 101 * 103 * 107 * 109 * 113], + ] + assert ( + ak.prod(depth2.to_typetracer(), axis=2, highlevel=False).form + == ak.prod(depth2, axis=2, highlevel=False).form + ) + + assert to_list(ak.prod(depth2, axis=-2, highlevel=False)) == [ + [2 * 13 * 31, 3 * 17 * 37, 5 * 19 * 41, 7 * 23 * 43, 11 * 29 * 47], + [53 * 73 * 101, 59 * 79 * 103, 61 * 83 * 107, 67 * 89 * 109, 71 * 97 * 113], + ] + assert ( + ak.prod(depth2.to_typetracer(), axis=-2, highlevel=False).form + == ak.prod(depth2, axis=-2, highlevel=False).form + ) + assert to_list(ak.prod(depth2, axis=1, highlevel=False)) == [ + [2 * 13 * 31, 3 * 17 * 37, 5 * 19 * 41, 7 * 23 * 43, 11 * 29 * 47], + [53 * 73 * 101, 59 * 79 * 103, 61 * 83 * 107, 67 * 89 * 109, 71 * 97 * 113], + ] + assert ( + ak.prod(depth2.to_typetracer(), axis=1, highlevel=False).form + == ak.prod(depth2, axis=1, highlevel=False).form + ) + + assert to_list(ak.prod(depth2, axis=-3, highlevel=False)) == [ + [2 * 53, 3 * 59, 5 * 61, 7 * 67, 11 * 71], + [13 * 73, 17 * 79, 19 * 83, 23 * 89, 29 * 97], + [31 * 101, 37 * 103, 41 * 107, 43 * 109, 47 * 113], + ] + assert ( + ak.prod(depth2.to_typetracer(), 
axis=-3, highlevel=False).form + == ak.prod(depth2, axis=-3, highlevel=False).form + ) + assert to_list(ak.prod(depth2, axis=0, highlevel=False)) == [ + [2 * 53, 3 * 59, 5 * 61, 7 * 67, 11 * 71], + [13 * 73, 17 * 79, 19 * 83, 23 * 89, 29 * 97], + [31 * 101, 37 * 103, 41 * 107, 43 * 109, 47 * 113], + ] + assert ( + ak.prod(depth2.to_typetracer(), axis=0, highlevel=False).form + == ak.prod(depth2, axis=0, highlevel=False).form + ) + del depth2 + + +def test_0115_generic_reducer_operation_reproduce_numpy_2(): + content2 = ak.contents.NumpyArray(np.array(primes[:12], dtype=np.int64)) + offsets3 = ak.index.Index64(np.array([0, 4, 8, 12], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + depth1 = ak.to_backend(depth1, "cuda", highlevel=False) + + assert to_list(ak.prod(depth1, -1, highlevel=False)) == [ + 2 * 3 * 5 * 7, + 11 * 13 * 17 * 19, + 23 * 29 * 31 * 37, + ] + assert ( + ak.prod(depth1.to_typetracer(), -1, highlevel=False).form + == ak.prod(depth1, -1, highlevel=False).form + ) + assert to_list(ak.prod(depth1, 1, highlevel=False)) == [ + 2 * 3 * 5 * 7, + 11 * 13 * 17 * 19, + 23 * 29 * 31 * 37, + ] + assert ( + ak.prod(depth1.to_typetracer(), 1, highlevel=False).form + == ak.prod(depth1, 1, highlevel=False).form + ) + + assert to_list(ak.prod(depth1, -2, highlevel=False)) == [ + 2 * 11 * 23, + 3 * 13 * 29, + 5 * 17 * 31, + 7 * 19 * 37, + ] + assert ( + ak.prod(depth1.to_typetracer(), -2, highlevel=False).form + == ak.prod(depth1, -2, highlevel=False).form + ) + assert to_list(ak.prod(depth1, 0, highlevel=False)) == [ + 2 * 11 * 23, + 3 * 13 * 29, + 5 * 17 * 31, + 7 * 19 * 37, + ] + assert ( + ak.prod(depth1.to_typetracer(), 0, highlevel=False).form + == ak.prod(depth1, 0, highlevel=False).form + ) + del depth1 + + +def test_0115_generic_reducer_operation_gaps_1(): + content1 = ak.contents.NumpyArray( + np.array([123] + primes[: 2 * 3 * 5], dtype=np.int64) + ) + offsets1 = ak.index.Index64(np.array([0, 1, 6, 11, 16, 21, 26, 31], 
dtype=np.int64)) + offsets2 = ak.index.Index64(np.array([1, 4, 7], dtype=np.int64)) + depth2 = ak.contents.ListOffsetArray( + offsets2, ak.contents.ListOffsetArray(offsets1, content1) + ) + depth2 = ak.to_backend(depth2, "cuda", highlevel=False) + + assert to_list(depth2) == [ + [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]], + [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]], + ] + + assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ + [106, 177, 305, 469, 781], + [949, 1343, 1577, 2047, 2813], + [3131, 3811, 4387, 4687, 5311], + ] + assert ( + ak.prod(depth2.to_typetracer(), -3, highlevel=False).form + == ak.prod(depth2, -3, highlevel=False).form + ) + del depth2 + + +def test_0115_generic_reducer_operation_gaps_2(): + content1 = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5 - 1], dtype=np.int64)) + offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 29], dtype=np.int64)) + offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) + depth2 = ak.contents.ListOffsetArray( + offsets2, ak.contents.ListOffsetArray(offsets1, content1) + ) + depth2 = ak.to_backend(depth2, "cuda", highlevel=False) + + assert to_list(depth2) == [ + [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]], + [ + [53, 59, 61, 67, 71], + [73, 79, 83, 89, 97], + [ + 101, + 103, + 107, + 109, + ], + ], + ] + + assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ + [106, 177, 305, 469, 781], + [949, 1343, 1577, 2047, 2813], + [3131, 3811, 4387, 4687, 47], + ] + assert ( + ak.prod(depth2.to_typetracer(), -3, highlevel=False).form + == ak.prod(depth2, -3, highlevel=False).form + ) + del depth2 + + +def test_0115_generic_reducer_operation_gaps_3(): + content1 = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5 - 2], dtype=np.int64)) + offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 28], dtype=np.int64)) + offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) + depth2 = ak.contents.ListOffsetArray( 
+ offsets2, ak.contents.ListOffsetArray(offsets1, content1) + ) + depth2 = ak.to_backend(depth2, "cuda", highlevel=False) + + assert to_list(depth2) == [ + [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]], + [ + [53, 59, 61, 67, 71], + [73, 79, 83, 89, 97], + [ + 101, + 103, + 107, + ], + ], + ] + + assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ + [106, 177, 305, 469, 781], + [949, 1343, 1577, 2047, 2813], + [3131, 3811, 4387, 43, 47], + ] + assert ( + ak.prod(depth2.to_typetracer(), -3, highlevel=False).form + == ak.prod(depth2, -3, highlevel=False).form + ) + del depth2 + + +def test_0115_generic_reducer_operation_gaps_4(): + content1 = ak.contents.NumpyArray( + np.array( + [ + 2, + 3, + 5, + 7, + 11, + 13, + 17, + 19, + 23, + 29, + 31, + 37, + 41, + 43, + 47, + 53, + 59, + 61, + 67, + 71, + 73, + 79, + 83, + 89, + 101, + 103, + 107, + 109, + ], + dtype=np.int64, + ) + ) + offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 24, 28], dtype=np.int64)) + offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) + depth2 = ak.contents.ListOffsetArray( + offsets2, ak.contents.ListOffsetArray(offsets1, content1) + ) + depth2 = ak.to_backend(depth2, "cuda", highlevel=False) + + assert to_list(depth2) == [ + [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]], + [ + [53, 59, 61, 67, 71], + [ + 73, + 79, + 83, + 89, + ], + [101, 103, 107, 109], + ], + ] + + assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ + [106, 177, 305, 469, 781], + [949, 1343, 1577, 2047, 29], + [3131, 3811, 4387, 4687, 47], + ] + assert ( + ak.prod(depth2.to_typetracer(), -3, highlevel=False).form + == ak.prod(depth2, -3, highlevel=False).form + ) + del depth2 + + +def test_0115_generic_reducer_operation_gaps_5(): + content1 = ak.contents.NumpyArray(np.array(primes[1 : 2 * 3 * 5], dtype=np.int64)) + offsets1 = ak.index.Index64(np.array([0, 4, 9, 14, 19, 24, 29], dtype=np.int64)) + offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) 
+ depth2 = ak.contents.ListOffsetArray( + offsets2, ak.contents.ListOffsetArray(offsets1, content1) + ) + depth2 = ak.to_backend(depth2, "cuda", highlevel=False) + + assert to_list(depth2) == [ + [[3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]], + [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]], + ] + + assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ + [159, 295, 427, 737, 71], + [949, 1343, 1577, 2047, 2813], + [3131, 3811, 4387, 4687, 5311], + ] + assert ( + ak.prod(depth2.to_typetracer(), -3, highlevel=False).form + == ak.prod(depth2, -3, highlevel=False).form + ) + del depth2 + + +def test_0115_generic_reducer_operation_gaps_6(): + content1 = ak.contents.NumpyArray(np.array(primes[2 : 2 * 3 * 5], dtype=np.int64)) + offsets1 = ak.index.Index64(np.array([0, 3, 8, 13, 18, 23, 28], dtype=np.int64)) + offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) + depth2 = ak.contents.ListOffsetArray( + offsets2, ak.contents.ListOffsetArray(offsets1, content1) + ) + depth2 = ak.to_backend(depth2, "cuda", highlevel=False) + + assert to_list(depth2) == [ + [[5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]], + [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]], + ] + + assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ + [265, 413, 671, 67, 71], + [949, 1343, 1577, 2047, 2813], + [3131, 3811, 4387, 4687, 5311], + ] + assert ( + ak.prod(depth2.to_typetracer(), -3, highlevel=False).form + == ak.prod(depth2, -3, highlevel=False).form + ) + del depth2 + + +def test_0115_generic_reducer_operation_gaps_7(): + content1 = ak.contents.NumpyArray( + np.array( + [ + 3, + 5, + 7, + 13, + 17, + 19, + 23, + 29, + 31, + 37, + 41, + 43, + 47, + 53, + 59, + 61, + 67, + 71, + 73, + 79, + 83, + 89, + 97, + 101, + 103, + 107, + 109, + 113, + ], + dtype=np.int64, + ) + ) + offsets1 = ak.index.Index64(np.array([0, 3, 8, 13, 18, 23, 28], dtype=np.int64)) + offsets2 = ak.index.Index64(np.array([0, 3, 6], 
dtype=np.int64)) + depth2 = ak.contents.ListOffsetArray( + offsets2, ak.contents.ListOffsetArray(offsets1, content1) + ) + depth2 = ak.to_backend(depth2, "cuda", highlevel=False) + + assert to_list(depth2) == [ + [ + [ + 3, + 5, + 7, + ], + [13, 17, 19, 23, 29], + [31, 37, 41, 43, 47], + ], + [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]], + ] + + assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ + [159, 295, 427, 67, 71], + [949, 1343, 1577, 2047, 2813], + [3131, 3811, 4387, 4687, 5311], + ] + assert ( + ak.prod(depth2.to_typetracer(), -3, highlevel=False).form + == ak.prod(depth2, -3, highlevel=False).form + ) + del depth2 + + +def test_0115_generic_reducer_operation_gaps_8(): + content1 = ak.contents.NumpyArray( + np.array( + [ + 3, + 5, + 7, + 11, + 13, + 17, + 19, + 23, + 31, + 37, + 41, + 43, + 47, + 53, + 59, + 61, + 67, + 71, + 73, + 79, + 83, + 89, + 97, + 101, + 103, + 107, + 109, + 113, + ], + dtype=np.int64, + ) + ) + offsets1 = ak.index.Index64(np.array([0, 4, 8, 13, 18, 23, 28], dtype=np.int64)) + offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) + depth2 = ak.contents.ListOffsetArray( + offsets2, ak.contents.ListOffsetArray(offsets1, content1) + ) + depth2 = ak.to_backend(depth2, "cuda", highlevel=False) + + assert to_list(depth2) == [ + [[3, 5, 7, 11], [13, 17, 19, 23], [31, 37, 41, 43, 47]], + [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]], + ] + + assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ + [159, 295, 427, 737, 71], + [949, 1343, 1577, 2047, 97], + [3131, 3811, 4387, 4687, 5311], + ] + assert ( + ak.prod(depth2.to_typetracer(), -3, highlevel=False).form + == ak.prod(depth2, -3, highlevel=False).form + ) + del depth2 + + +def test_0115_generic_reducer_operation_gaps_9(): + content1 = ak.contents.NumpyArray( + np.array( + [ + 2, + 3, + 5, + 7, + 11, + 13, + 17, + 19, + 23, + 29, + 31, + 37, + 41, + 43, + 53, + 59, + 61, + 67, + 71, + 73, + 79, + 83, + 89, + 97, 
+ 101, + 103, + 107, + 109, + ], + dtype=np.int64, + ) + ) + offsets1 = ak.index.Index64(np.array([0, 5, 10, 14, 19, 24, 28], dtype=np.int64)) + offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) + depth2 = ak.contents.ListOffsetArray( + offsets2, ak.contents.ListOffsetArray(offsets1, content1) + ) + depth2 = ak.to_backend(depth2, "cuda", highlevel=False) + + assert to_list(depth2) == [ + [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43]], + [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109]], + ] + + assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ + [106, 177, 305, 469, 781], + [949, 1343, 1577, 2047, 2813], + [3131, 3811, 4387, 4687], + ] + assert ( + ak.prod(depth2.to_typetracer(), -3, highlevel=False).form + == ak.prod(depth2, -3, highlevel=False).form + ) + del depth2 + + +def test_0115_generic_reducer_operation_gaps_10(): + content1 = ak.contents.NumpyArray( + np.array( + [ + 2, + 3, + 5, + 7, + 11, + 13, + 17, + 19, + 23, + 31, + 37, + 41, + 43, + 47, + 53, + 59, + 61, + 67, + 71, + 73, + 79, + 83, + 89, + 101, + 103, + 107, + 109, + 113, + ], + dtype=np.int64, + ) + ) + offsets1 = ak.index.Index64(np.array([0, 5, 9, 14, 19, 23, 28], dtype=np.int64)) + offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) + depth2 = ak.contents.ListOffsetArray( + offsets2, ak.contents.ListOffsetArray(offsets1, content1) + ) + depth2 = ak.to_backend(depth2, "cuda", highlevel=False) + + assert to_list(depth2) == [ + [[2, 3, 5, 7, 11], [13, 17, 19, 23], [31, 37, 41, 43, 47]], + [[53, 59, 61, 67, 71], [73, 79, 83, 89], [101, 103, 107, 109, 113]], + ] + + assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ + [106, 177, 305, 469, 781], + [949, 1343, 1577, 2047], + [3131, 3811, 4387, 4687, 5311], + ] + assert ( + ak.prod(depth2.to_typetracer(), -3, highlevel=False).form + == ak.prod(depth2, -3, highlevel=False).form + ) + del depth2 + + +def test_0115_generic_reducer_operation_gaps_11(): + content1 = 
ak.contents.NumpyArray(np.array(primes[:9], dtype=np.int64)) + offsets1 = ak.index.Index64(np.array([0, 3, 4, 6, 6, 7, 9], dtype=np.int64)) + offsets2 = ak.index.Index64(np.array([0, 2, 4, 6], dtype=np.int64)) + depth2 = ak.contents.ListOffsetArray( + offsets2, ak.contents.ListOffsetArray(offsets1, content1) + ) + depth2 = ak.to_backend(depth2, "cuda", highlevel=False) + + assert to_list(depth2) == [[[2, 3, 5], [7]], [[11, 13], []], [[17], [19, 23]]] + + assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ + [2 * 11 * 17, 3 * 13, 5], + [7 * 19, 23], + ] + assert ( + ak.prod(depth2.to_typetracer(), -3, highlevel=False).form + == ak.prod(depth2, -3, highlevel=False).form + ) + del depth2 + + +def test_0115_generic_reducer_operation_gaps_12(): + content1 = ak.contents.NumpyArray(np.array(primes[:9], dtype=np.int64)) + offsets1 = ak.index.Index64(np.array([0, 3, 4, 6, 7, 9], dtype=np.int64)) + offsets2 = ak.index.Index64(np.array([0, 2, 3, 5], dtype=np.int64)) + depth2 = ak.contents.ListOffsetArray( + offsets2, ak.contents.ListOffsetArray(offsets1, content1) + ) + depth2 = ak.to_backend(depth2, "cuda", highlevel=False) + + assert to_list(depth2) == [[[2, 3, 5], [7]], [[11, 13]], [[17], [19, 23]]] + + assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ + [2 * 11 * 17, 3 * 13, 5], + [7 * 19, 23], + ] + assert ( + ak.prod(depth2.to_typetracer(), -3, highlevel=False).form + == ak.prod(depth2, -3, highlevel=False).form + ) + del depth2 + + +def test_0115_generic_reducer_operation_gaps_13(): + content1 = ak.contents.NumpyArray(np.array(primes[:10], dtype=np.int64)) + offsets1 = ak.index.Index64(np.array([0, 3, 5, 6, 8, 9, 10], dtype=np.int64)) + offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) + depth2 = ak.contents.ListOffsetArray( + offsets2, ak.contents.ListOffsetArray(offsets1, content1) + ) + depth2 = ak.to_backend(depth2, "cuda", highlevel=False) + + assert to_list(depth2) == [[[2, 3, 5], [7, 11], [13]], [[17, 19], [23], [29]]] + + assert 
to_list(ak.prod(depth2, -3, highlevel=False)) == [ + [34, 57, 5], + [161, 11], + [377], + ] + assert ( + ak.prod(depth2.to_typetracer(), -3, highlevel=False).form + == ak.prod(depth2, -3, highlevel=False).form + ) + del depth2 + + +def test_0115_generic_reducer_operation_gaps_14(): + content1 = ak.contents.NumpyArray(np.array(primes[:9], dtype=np.int64)) + offsets1 = ak.index.Index64(np.array([0, 3, 3, 5, 6, 8, 9], dtype=np.int64)) + offsets2 = ak.index.Index64(np.array([0, 4, 6], dtype=np.int64)) + depth2 = ak.contents.ListOffsetArray( + offsets2, ak.contents.ListOffsetArray(offsets1, content1) + ) + depth2 = ak.to_backend(depth2, "cuda", highlevel=False) + + assert to_list(depth2) == [[[2, 3, 5], [], [7, 11], [13]], [[17, 19], [23]]] + + assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ + [34, 57, 5], + [23], + [7, 11], + [13], + ] + assert ( + ak.prod(depth2.to_typetracer(), -3, highlevel=False).form + == ak.prod(depth2, -3, highlevel=False).form + ) + del depth2 + + +def test_0115_generic_reducer_operation_gaps_15(): + content1 = ak.contents.NumpyArray(np.array(primes[:9], dtype=np.int64)) + offsets1 = ak.index.Index64(np.array([0, 3, 3, 5, 6, 8, 9], dtype=np.int64)) + offsets2 = ak.index.Index64(np.array([0, 4, 4, 6], dtype=np.int64)) + depth2 = ak.contents.ListOffsetArray( + offsets2, ak.contents.ListOffsetArray(offsets1, content1) + ) + depth2 = ak.to_backend(depth2, "cuda", highlevel=False) + + assert to_list(depth2) == [[[2, 3, 5], [], [7, 11], [13]], [], [[17, 19], [23]]] + + assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ + [34, 57, 5], + [23], + [7, 11], + [13], + ] + assert ( + ak.prod(depth2.to_typetracer(), -3, highlevel=False).form + == ak.prod(depth2, -3, highlevel=False).form + ) + del depth2 + + +def test_0115_generic_reducer_operation_gaps_16(): + content1 = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5], dtype=np.int64)) + offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 30], dtype=np.int64)) + offsets2 = 
ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) + depth2 = ak.contents.ListOffsetArray( + offsets2, ak.contents.ListOffsetArray(offsets1, content1) + ) + depth2 = ak.to_backend(depth2, "cuda", highlevel=False) + + assert to_list(depth2) == [ + [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]], + [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]], + ] + + assert to_list(ak.prod(depth2, -1, highlevel=False)) == [ + [2 * 3 * 5 * 7 * 11, 13 * 17 * 19 * 23 * 29, 31 * 37 * 41 * 43 * 47], + [53 * 59 * 61 * 67 * 71, 73 * 79 * 83 * 89 * 97, 101 * 103 * 107 * 109 * 113], + ] + assert ( + ak.prod(depth2.to_typetracer(), -1, highlevel=False).form + == ak.prod(depth2, -1, highlevel=False).form + ) + + assert to_list(ak.prod(depth2, -2, highlevel=False)) == [ + [2 * 13 * 31, 3 * 17 * 37, 5 * 19 * 41, 7 * 23 * 43, 11 * 29 * 47], + [53 * 73 * 101, 59 * 79 * 103, 61 * 83 * 107, 67 * 89 * 109, 71 * 97 * 113], + ] + assert ( + ak.prod(depth2.to_typetracer(), -2, highlevel=False).form + == ak.prod(depth2, -2, highlevel=False).form + ) + del depth2 + + +def test_0115_generic_reducer_operation_gaps_17(): + content1 = ak.contents.NumpyArray(np.array(primes[:9], dtype=np.int64)) + offsets1 = ak.index.Index64(np.array([0, 3, 3, 5, 6, 8, 9], dtype=np.int64)) + offsets2 = ak.index.Index64(np.array([0, 4, 4, 6], dtype=np.int64)) + depth2 = ak.contents.ListOffsetArray( + offsets2, ak.contents.ListOffsetArray(offsets1, content1) + ) + depth2 = ak.to_backend(depth2, "cuda", highlevel=False) + + assert to_list(depth2) == [ + [[2, 3, 5], [], [7, 11], [13]], + [], + [[17, 19], [23]], + ] + + assert to_list(ak.prod(depth2, -1, highlevel=False)) == [ + [2 * 3 * 5, 1, 7 * 11, 13], + [], + [17 * 19, 23], + ] + assert ( + ak.prod(depth2.to_typetracer(), -1, highlevel=False).form + == ak.prod(depth2, -1, highlevel=False).form + ) + + assert to_list(ak.prod(depth2, -2, highlevel=False)) == [ + [2 * 7 * 13, 3 * 11, 5], + [], + [17 * 23, 19], + ] + assert ( + 
ak.prod(depth2.to_typetracer(), -2, highlevel=False).form + == ak.prod(depth2, -2, highlevel=False).form + ) + + assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ + [2 * 17, 3 * 19, 5], + [23], + [7, 11], + [13], + ] + assert ( + ak.prod(depth2.to_typetracer(), -3, highlevel=False).form + == ak.prod(depth2, -3, highlevel=False).form + ) + del depth2 From 347c4c902c629f409664f2b4c97a25965e4f0c4d Mon Sep 17 00:00:00 2001 From: Manasvi Goyal Date: Fri, 21 Jun 2024 15:30:47 +0200 Subject: [PATCH 29/33] test: add more reducer tests --- ...est_3162_cuda_generic_reducer_operation.py | 457 ++++++++++++++++++ 1 file changed, 457 insertions(+) diff --git a/tests-cuda/test_3162_cuda_generic_reducer_operation.py b/tests-cuda/test_3162_cuda_generic_reducer_operation.py index bcfc5488c9..94d1bb3570 100644 --- a/tests-cuda/test_3162_cuda_generic_reducer_operation.py +++ b/tests-cuda/test_3162_cuda_generic_reducer_operation.py @@ -863,3 +863,460 @@ def test_0115_generic_reducer_operation_gaps_17(): == ak.prod(depth2, -3, highlevel=False).form ) del depth2 + + +def test_0115_generic_reducer_operation_complicated(): + offsets1 = ak.index.Index64(np.array([0, 3, 3, 5], dtype=np.int64)) + content1 = ak.contents.ListOffsetArray( + offsets1, ak.contents.NumpyArray(np.array(primes[:5], dtype=np.int64)) + ) + offsets2 = ak.index.Index64(np.array([0, 3, 3, 5, 6, 8, 9], dtype=np.int64)) + offsets3 = ak.index.Index64(np.array([0, 4, 4, 6], dtype=np.int64)) + content2 = ak.contents.ListOffsetArray( + offsets3, + ak.contents.ListOffsetArray( + offsets2, ak.contents.NumpyArray(np.array(primes[:9], dtype=np.int64)) + ), + ) + offsets4 = ak.index.Index64(np.array([0, 1, 1, 3], dtype=np.int64)) + complicated = ak.contents.ListOffsetArray( + offsets4, ak.contents.RecordArray([content1, content2], ["x", "y"]) + ) + complicated = ak.to_backend(complicated, "cuda", highlevel=False) + + assert to_list(complicated) == [ + [{"x": [2, 3, 5], "y": [[2, 3, 5], [], [7, 11], [13]]}], + [], + [{"x": 
[], "y": []}, {"x": [7, 11], "y": [[17, 19], [23]]}], + ] + + assert to_list(complicated["x"]) == [[[2, 3, 5]], [], [[], [7, 11]]] + assert complicated.to_typetracer()["x"].form == complicated["x"].form + assert to_list(complicated["y"]) == [ + [[[2, 3, 5], [], [7, 11], [13]]], + [], + [[], [[17, 19], [23]]], + ] + assert complicated.to_typetracer()["y"].form == complicated["y"].form + + with pytest.raises(TypeError): + to_list(ak.prod(complicated, -1, highlevel=False)) + + with pytest.raises(TypeError): + assert ( + ak.prod(complicated.to_typetracer(), -1, highlevel=False).form + == ak.prod(complicated, -1, highlevel=False).form + ) + + assert to_list(ak.prod(complicated["x"], -1, highlevel=False)) == [ + [30], + [], + [1, 77], + ] + assert ( + ak.prod(complicated.to_typetracer()["x"], -1, highlevel=False).form + == ak.prod(complicated["x"], -1, highlevel=False).form + ) + assert to_list(ak.prod(complicated["y"], -1, highlevel=False)) == [ + [[30, 1, 77, 13]], + [], + [[], [323, 23]], + ] + assert ( + ak.prod(complicated.to_typetracer()["y"], -1, highlevel=False).form + == ak.prod(complicated["y"], -1, highlevel=False).form + ) + + with pytest.raises(TypeError): + to_list(ak.prod(complicated, -2, highlevel=False)) + + with pytest.raises(TypeError): + assert ( + ak.prod(complicated.to_typetracer(), -2, highlevel=False).form + == ak.prod(complicated, -2, highlevel=False).form + ) + assert to_list(ak.prod(complicated["x"], -2, highlevel=False)) == [ + [2, 3, 5], + [], + [7, 11], + ] + assert ( + ak.prod(complicated.to_typetracer()["x"], -2, highlevel=False).form + == ak.prod(complicated["x"], -2, highlevel=False).form + ) + assert to_list(ak.prod(complicated["y"], -2, highlevel=False)) == [ + [[182, 33, 5]], + [], + [[], [391, 19]], + ] + assert ( + ak.prod(complicated.to_typetracer()["y"], -2, highlevel=False).form + == ak.prod(complicated["y"], -2, highlevel=False).form + ) + + assert to_list(complicated[0]) == [ + {"x": [2, 3, 5], "y": [[2, 3, 5], [], [7, 11], 
[13]]} + ] + assert complicated.to_typetracer()[0].form == complicated[0].form + + with pytest.raises(TypeError): + to_list(ak.prod(complicated[0], -1, highlevel=False)) + + with pytest.raises(TypeError): + to_list(ak.prod(complicated.to_typetracer()[0], -1, highlevel=False)) + del complicated + + +def test_0115_generic_reducer_operation_EmptyArray(): + offsets = ak.index.Index64(np.array([0, 0, 0, 0], dtype=np.int64)) + array = ak.contents.ListOffsetArray(offsets, ak.contents.EmptyArray()) + array = ak.to_backend(array, "cuda") + + assert to_list(array) == [[], [], []] + + assert to_list(ak.prod(array, -1, highlevel=False)) == [1, 1, 1] + assert ( + ak.prod(array.to_typetracer(), -1, highlevel=False).form + == ak.prod(array, -1, highlevel=False).form + ) + + offsets = ak.index.Index64(np.array([0, 0, 0, 0], dtype=np.int64)) + array = ak.contents.ListOffsetArray( + offsets, ak.contents.NumpyArray(np.array([], dtype=np.int64)) + ) + array = ak.to_backend(array, "cuda") + + assert to_list(array) == [[], [], []] + + assert to_list(ak.prod(array, -1, highlevel=False)) == [1, 1, 1] + assert ( + ak.prod(array.to_typetracer(), -1, highlevel=False).form + == ak.prod(array, -1, highlevel=False).form + ) + del array + + +def test_0115_generic_reducer_operation_IndexedOptionArray_1(): + content = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5], dtype=np.int64)) + offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 30], dtype=np.int64)) + listoffsetarray = ak.contents.ListOffsetArray(offsets1, content) + index = ak.index.Index64(np.array([5, 4, 3, 2, 1, 0], dtype=np.int64)) + indexedarray = ak.contents.IndexedArray(index, listoffsetarray) + offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) + depth2 = ak.contents.ListOffsetArray(offsets2, indexedarray) + depth2 = ak.to_backend(depth2, "cuda") + + assert to_list(depth2) == [ + [[101, 103, 107, 109, 113], [73, 79, 83, 89, 97], [53, 59, 61, 67, 71]], + [[31, 37, 41, 43, 47], [13, 17, 19, 23, 29], [2, 3, 
5, 7, 11]], + ] + + assert to_list(ak.prod(depth2, -1, highlevel=False)) == [ + [101 * 103 * 107 * 109 * 113, 73 * 79 * 83 * 89 * 97, 53 * 59 * 61 * 67 * 71], + [31 * 37 * 41 * 43 * 47, 13 * 17 * 19 * 23 * 29, 2 * 3 * 5 * 7 * 11], + ] + assert ( + ak.prod(depth2.to_typetracer(), -1, highlevel=False).form + == ak.prod(depth2, -1, highlevel=False).form + ) + + assert to_list(ak.prod(depth2, -2, highlevel=False)) == [ + [101 * 73 * 53, 103 * 79 * 59, 107 * 83 * 61, 109 * 89 * 67, 113 * 97 * 71], + [31 * 13 * 2, 37 * 17 * 3, 41 * 19 * 5, 43 * 23 * 7, 47 * 29 * 11], + ] + assert ( + ak.prod(depth2.to_typetracer(), -2, highlevel=False).form + == ak.prod(depth2, -2, highlevel=False).form + ) + + assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ + [101 * 31, 103 * 37, 107 * 41, 109 * 43, 113 * 47], + [73 * 13, 79 * 17, 83 * 19, 89 * 23, 97 * 29], + [53 * 2, 59 * 3, 61 * 5, 67 * 7, 71 * 11], + ] + assert ( + ak.prod(depth2.to_typetracer(), -3, highlevel=False).form + == ak.prod(depth2, -3, highlevel=False).form + ) + del depth2 + + +def test_0115_generic_reducer_operation_IndexedOptionArray_2(): + content = ak.contents.NumpyArray( + np.array( + [ + 2, + 3, + 5, + 7, + 11, + 31, + 37, + 41, + 43, + 47, + 53, + 59, + 61, + 67, + 71, + 101, + 103, + 107, + 109, + 113, + ], + dtype=np.int64, + ) + ) + offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20], dtype=np.int64)) + listoffsetarray = ak.contents.ListOffsetArray(offsets1, content) + index = ak.index.Index64(np.array([3, -1, 2, 1, -1, 0], dtype=np.int64)) + indexedoptionarray = ak.contents.IndexedOptionArray(index, listoffsetarray) + offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) + depth2 = ak.contents.ListOffsetArray(offsets2, indexedoptionarray) + depth2 = ak.to_backend(depth2, "cuda") + + assert to_list(depth2) == [ + [[101, 103, 107, 109, 113], None, [53, 59, 61, 67, 71]], + [[31, 37, 41, 43, 47], None, [2, 3, 5, 7, 11]], + ] + + assert to_list(ak.prod(depth2, -1, highlevel=False)) == [ + 
[101 * 103 * 107 * 109 * 113, None, 53 * 59 * 61 * 67 * 71], + [31 * 37 * 41 * 43 * 47, None, 2 * 3 * 5 * 7 * 11], + ] + assert ( + ak.prod(depth2.to_typetracer(), -1, highlevel=False).form + == ak.prod(depth2, -1, highlevel=False).form + ) + + assert to_list(ak.prod(depth2, -2, highlevel=False)) == [ + [101 * 53, 103 * 59, 107 * 61, 109 * 67, 113 * 71], + [31 * 2, 37 * 3, 41 * 5, 43 * 7, 47 * 11], + ] + assert ( + ak.prod(depth2.to_typetracer(), -2, highlevel=False).form + == ak.prod(depth2, -2, highlevel=False).form + ) + + assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ + [101 * 31, 103 * 37, 107 * 41, 109 * 43, 113 * 47], + [], + [53 * 2, 59 * 3, 61 * 5, 67 * 7, 71 * 11], + ] + assert ( + ak.prod(depth2.to_typetracer(), -3, highlevel=False).form + == ak.prod(depth2, -3, highlevel=False).form + ) + del depth2 + + +def test_0115_generic_reducer_operation_IndexedOptionArray_3(): + content = ak.contents.NumpyArray( + np.array( + [ + 2, + 3, + 5, + 7, + 11, + 31, + 37, + 41, + 43, + 47, + 53, + 59, + 61, + 67, + 71, + 101, + 103, + 107, + 109, + 113, + ], + dtype=np.int64, + ) + ) + index = ak.index.Index64( + np.array( + [ + 15, + 16, + 17, + 18, + 19, + -1, + -1, + -1, + -1, + -1, + 10, + 11, + 12, + 13, + 14, + 5, + 6, + 7, + 8, + 9, + -1, + -1, + -1, + -1, + -1, + 0, + 1, + 2, + 3, + 4, + ], + dtype=np.int64, + ) + ) + indexedoptionarray = ak.contents.IndexedOptionArray(index, content) + offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 30], dtype=np.int64)) + listoffsetarray = ak.contents.ListOffsetArray(offsets1, indexedoptionarray) + offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) + depth2 = ak.contents.ListOffsetArray(offsets2, listoffsetarray) + depth2 = ak.to_backend(depth2, "cuda") + + assert to_list(depth2) == [ + [ + [101, 103, 107, 109, 113], + [None, None, None, None, None], + [53, 59, 61, 67, 71], + ], + [[31, 37, 41, 43, 47], [None, None, None, None, None], [2, 3, 5, 7, 11]], + ] + + assert 
to_list(ak.prod(depth2, -1, highlevel=False)) == [ + [101 * 103 * 107 * 109 * 113, 1 * 1 * 1 * 1 * 1, 53 * 59 * 61 * 67 * 71], + [31 * 37 * 41 * 43 * 47, 1 * 1 * 1 * 1 * 1, 2 * 3 * 5 * 7 * 11], + ] + assert ( + ak.prod(depth2.to_typetracer(), -1, highlevel=False).form + == ak.prod(depth2, -1, highlevel=False).form + ) + + assert to_list(ak.prod(depth2, -2, highlevel=False)) == [ + [101 * 53, 103 * 59, 107 * 61, 109 * 67, 113 * 71], + [31 * 2, 37 * 3, 41 * 5, 43 * 7, 47 * 11], + ] + assert ( + ak.prod(depth2.to_typetracer(), -2, highlevel=False).form + == ak.prod(depth2, -2, highlevel=False).form + ) + + assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ + [101 * 31, 103 * 37, 107 * 41, 109 * 43, 113 * 47], + [1, 1, 1, 1, 1], + [53 * 2, 59 * 3, 61 * 5, 67 * 7, 71 * 11], + ] + assert ( + ak.prod(depth2.to_typetracer(), -3, highlevel=False).form + == ak.prod(depth2, -3, highlevel=False).form + ) + del depth2 + + +def test_0115_generic_reducer_operation_IndexedOptionArray_4(): + content = ak.contents.NumpyArray( + np.array( + [ + 2, + 3, + 5, + 7, + 11, + 31, + 37, + 41, + 43, + 47, + 53, + 59, + 61, + 67, + 71, + 101, + 103, + 107, + 109, + 113, + ], + dtype=np.int64, + ) + ) + index = ak.index.Index64( + np.array( + [ + 15, + 16, + 17, + 18, + 19, + -1, + 10, + 11, + 12, + 13, + 14, + 5, + 6, + 7, + 8, + 9, + -1, + 0, + 1, + 2, + 3, + 4, + ], + dtype=np.int64, + ) + ) + indexedoptionarray = ak.contents.IndexedOptionArray(index, content) + offsets1 = ak.index.Index64(np.array([0, 5, 6, 11, 16, 17, 22], dtype=np.int64)) + listoffsetarray = ak.contents.ListOffsetArray(offsets1, indexedoptionarray) + offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) + depth2 = ak.contents.ListOffsetArray(offsets2, listoffsetarray) + depth2 = ak.to_backend(depth2, "cuda") + + assert to_list(depth2) == [ + [[101, 103, 107, 109, 113], [None], [53, 59, 61, 67, 71]], + [[31, 37, 41, 43, 47], [None], [2, 3, 5, 7, 11]], + ] + + assert to_list(ak.prod(depth2, -1, 
highlevel=False)) == [ + [101 * 103 * 107 * 109 * 113, 1, 53 * 59 * 61 * 67 * 71], + [31 * 37 * 41 * 43 * 47, 1, 2 * 3 * 5 * 7 * 11], + ] + assert ( + ak.prod(depth2.to_typetracer(), -1, highlevel=False).form + == ak.prod(depth2, -1, highlevel=False).form + ) + + assert to_list(ak.prod(depth2, -2, highlevel=False)) == [ + [101 * 53, 103 * 59, 107 * 61, 109 * 67, 113 * 71], + [31 * 2, 37 * 3, 41 * 5, 43 * 7, 47 * 11], + ] + assert ( + ak.prod(depth2.to_typetracer(), -2, highlevel=False).form + == ak.prod(depth2, -2, highlevel=False).form + ) + + assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ + [101 * 31, 103 * 37, 107 * 41, 109 * 43, 113 * 47], + [1], + [53 * 2, 59 * 3, 61 * 5, 67 * 7, 71 * 11], + ] + assert ( + ak.prod(depth2.to_typetracer(), -3, highlevel=False).form + == ak.prod(depth2, -3, highlevel=False).form + ) + del depth2 From bc7e1c02775b84feabe0c5afe97605a9086fc54f Mon Sep 17 00:00:00 2001 From: Manasvi Goyal Date: Fri, 21 Jun 2024 15:55:37 +0200 Subject: [PATCH 30/33] test: add more reducer tests 2 --- ...est_3162_cuda_generic_reducer_operation.py | 627 ++++++++++++++++++ 1 file changed, 627 insertions(+) diff --git a/tests-cuda/test_3162_cuda_generic_reducer_operation.py b/tests-cuda/test_3162_cuda_generic_reducer_operation.py index 94d1bb3570..cd843a1f56 100644 --- a/tests-cuda/test_3162_cuda_generic_reducer_operation.py +++ b/tests-cuda/test_3162_cuda_generic_reducer_operation.py @@ -1320,3 +1320,630 @@ def test_0115_generic_reducer_operation_IndexedOptionArray_4(): == ak.prod(depth2, -3, highlevel=False).form ) del depth2 + + +def test_0115_generic_reducer_operation_sum(): + content2 = ak.contents.NumpyArray( + np.array([1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048], dtype=np.int64) + ) + offsets3 = ak.index.Index64(np.array([0, 4, 8, 12], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + depth1 = ak.to_backend(depth1, "cuda", highlevel=False) + + assert to_list(ak.sum(depth1, -1, highlevel=False)) == [ + 
1 + 2 + 4 + 8, + 16 + 32 + 64 + 128, + 256 + 512 + 1024 + 2048, + ] + assert ( + ak.sum(depth1.to_typetracer(), -1, highlevel=False).form + == ak.sum(depth1, -1, highlevel=False).form + ) + assert to_list(ak.sum(depth1, 1, highlevel=False)) == [ + 1 + 2 + 4 + 8, + 16 + 32 + 64 + 128, + 256 + 512 + 1024 + 2048, + ] + assert ( + ak.sum(depth1.to_typetracer(), 1, highlevel=False).form + == ak.sum(depth1, 1, highlevel=False).form + ) + + assert to_list(ak.sum(depth1, -2, highlevel=False)) == [ + 1 + 16 + 256, + 2 + 32 + 512, + 4 + 64 + 1024, + 8 + 128 + 2048, + ] + assert ( + ak.sum(depth1.to_typetracer(), -2, highlevel=False).form + == ak.sum(depth1, -2, highlevel=False).form + ) + assert to_list(ak.sum(depth1, 0, highlevel=False)) == [ + 1 + 16 + 256, + 2 + 32 + 512, + 4 + 64 + 1024, + 8 + 128 + 2048, + ] + assert ( + ak.sum(depth1.to_typetracer(), 0, highlevel=False).form + == ak.sum(depth1, 0, highlevel=False).form + ) + del depth1 + + +def test_0115_generic_reducer_operation_any(): + content2 = ak.contents.NumpyArray( + np.array([1.1, 2.2, 3.3, 0.0, 2.2, 0.0, 0.0, 0.0, 0.0, 0.0]) + ) + offsets3 = ak.index.Index64(np.array([0, 3, 6, 10], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + depth1 = ak.to_backend(depth1, "cuda", highlevel=False) + + assert to_list(depth1) == [ + [1.1, 2.2, 3.3], + [0.0, 2.2, 0.0], + [0.0, 0.0, 0.0, 0.0], + ] + + assert to_list(ak.any(depth1, -1, highlevel=False)) == [True, True, False] + assert ( + ak.any(depth1.to_typetracer(), -1, highlevel=False).form + == ak.any(depth1, -1, highlevel=False).form + ) + assert to_list(ak.any(depth1, 1, highlevel=False)) == [True, True, False] + assert ( + ak.any(depth1.to_typetracer(), 1, highlevel=False).form + == ak.any(depth1, 1, highlevel=False).form + ) + + assert to_list(ak.any(depth1, -2, highlevel=False)) == [True, True, True, False] + assert ( + ak.any(depth1.to_typetracer(), -2, highlevel=False).form + == ak.any(depth1, -2, highlevel=False).form + ) + assert 
to_list(ak.any(depth1, 0, highlevel=False)) == [True, True, True, False] + assert ( + ak.any(depth1.to_typetracer(), 0, highlevel=False).form + == ak.any(depth1, 0, highlevel=False).form + ) + del depth1 + + +def test_0115_generic_reducer_operation_all(): + content2 = ak.contents.NumpyArray( + np.array([1.1, 2.2, 3.3, 0.0, 2.2, 0.0, 0.0, 2.2, 0.0, 4.4]) + ) + offsets3 = ak.index.Index64(np.array([0, 3, 6, 10], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + depth1 = ak.to_backend(depth1, "cuda", highlevel=False) + + assert to_list(depth1) == [ + [1.1, 2.2, 3.3], + [0.0, 2.2, 0.0], + [0.0, 2.2, 0.0, 4.4], + ] + + assert to_list(ak.all(depth1, -1, highlevel=False)) == [True, False, False] + assert ( + ak.all(depth1.to_typetracer(), -1, highlevel=False).form + == ak.all(depth1, -1, highlevel=False).form + ) + assert to_list(ak.all(depth1, 1, highlevel=False)) == [True, False, False] + assert ( + ak.all(depth1.to_typetracer(), 1, highlevel=False).form + == ak.all(depth1, 1, highlevel=False).form + ) + + assert to_list(ak.all(depth1, -2, highlevel=False)) == [False, True, False, True] + assert ( + ak.all(depth1.to_typetracer(), -2, highlevel=False).form + == ak.all(depth1, -2, highlevel=False).form + ) + assert to_list(ak.all(depth1, 0, highlevel=False)) == [False, True, False, True] + assert ( + ak.all(depth1.to_typetracer(), 0, highlevel=False).form + == ak.all(depth1, 0, highlevel=False).form + ) + del depth1 + + +def test_0115_generic_reducer_operation_count(): + content2 = ak.contents.NumpyArray( + np.array([1.1, 2.2, 3.3, 0.0, 2.2, 0.0, 0.0, 2.2, 0.0, 4.4]) + ) + offsets3 = ak.index.Index64(np.array([0, 3, 6, 10], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + depth1 = ak.to_backend(depth1, "cuda", highlevel=False) + + assert to_list(depth1) == [ + [1.1, 2.2, 3.3], + [0.0, 2.2, 0.0], + [0.0, 2.2, 0.0, 4.4], + ] + + assert to_list(ak.count(depth1, -1, highlevel=False)) == [3, 3, 4] + assert ( + 
ak.count(depth1.to_typetracer(), -1, highlevel=False).form + == ak.count(depth1, -1, highlevel=False).form + ) + assert to_list(ak.count(depth1, 1, highlevel=False)) == [3, 3, 4] + assert ( + ak.count(depth1.to_typetracer(), 1, highlevel=False).form + == ak.count(depth1, 1, highlevel=False).form + ) + + assert to_list(ak.count(depth1, -2, highlevel=False)) == [3, 3, 3, 1] + assert ( + ak.count(depth1.to_typetracer(), -2, highlevel=False).form + == ak.count(depth1, -2, highlevel=False).form + ) + assert to_list(ak.count(depth1, 0, highlevel=False)) == [3, 3, 3, 1] + assert ( + ak.count(depth1.to_typetracer(), 0, highlevel=False).form + == ak.count(depth1, 0, highlevel=False).form + ) + del depth1 + + +def test_0115_generic_reducer_operation_count_nonzero(): + content2 = ak.contents.NumpyArray( + np.array([1.1, 2.2, 3.3, 0.0, 2.2, 0.0, 0.0, 2.2, 0.0, 4.4]) + ) + offsets3 = ak.index.Index64(np.array([0, 3, 6, 10], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + depth1 = ak.to_backend(depth1, "cuda", highlevel=False) + + assert to_list(depth1) == [ + [1.1, 2.2, 3.3], + [0.0, 2.2, 0.0], + [0.0, 2.2, 0.0, 4.4], + ] + + assert to_list(ak.count_nonzero(depth1, -1, highlevel=False)) == [3, 1, 2] + assert ( + ak.count_nonzero(depth1.to_typetracer(), -1, highlevel=False).form + == ak.count_nonzero(depth1, -1, highlevel=False).form + ) + assert to_list(ak.count_nonzero(depth1, 1, highlevel=False)) == [3, 1, 2] + assert ( + ak.count_nonzero(depth1.to_typetracer(), 1, highlevel=False).form + == ak.count_nonzero(depth1, 1, highlevel=False).form + ) + + assert to_list(ak.count_nonzero(depth1, -2, highlevel=False)) == [1, 3, 1, 1] + assert ( + ak.count_nonzero(depth1.to_typetracer(), -2, highlevel=False).form + == ak.count_nonzero(depth1, -2, highlevel=False).form + ) + assert to_list(ak.count_nonzero(depth1, 0, highlevel=False)) == [1, 3, 1, 1] + assert ( + ak.count_nonzero(depth1.to_typetracer(), 0, highlevel=False).form + == ak.count_nonzero(depth1, 
0, highlevel=False).form + ) + del depth1 + + +def test_0115_generic_reducer_operation_count_min_1(): + content2 = ak.contents.NumpyArray( + np.array([1.1, 2.2, 3.3, 0.0, 2.2, 0.0, 0.0, 2.2, 0.0, 4.4]) + ) + offsets3 = ak.index.Index64(np.array([0, 3, 6, 10], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + depth1 = ak.to_backend(depth1, "cuda", highlevel=False) + + assert to_list(depth1) == [ + [1.1, 2.2, 3.3], + [0.0, 2.2, 0.0], + [0.0, 2.2, 0.0, 4.4], + ] + + assert to_list(ak.min(depth1, -1, highlevel=False)) == [1.1, 0.0, 0.0] + assert ( + ak.min(depth1.to_typetracer(), -1, highlevel=False).form + == ak.min(depth1, -1, highlevel=False).form + ) + assert to_list(ak.min(depth1, 1, highlevel=False)) == [1.1, 0.0, 0.0] + assert ( + ak.min(depth1.to_typetracer(), 1, highlevel=False).form + == ak.min(depth1, 1, highlevel=False).form + ) + + assert to_list(ak.min(depth1, -2, highlevel=False)) == [0.0, 2.2, 0.0, 4.4] + assert ( + ak.min(depth1.to_typetracer(), -2, highlevel=False).form + == ak.min(depth1, -2, highlevel=False).form + ) + assert to_list(ak.min(depth1, 0, highlevel=False)) == [0.0, 2.2, 0.0, 4.4] + assert ( + ak.min(depth1.to_typetracer(), 0, highlevel=False).form + == ak.min(depth1, 0, highlevel=False).form + ) + del depth1 + + +def test_0115_generic_reducer_operation_count_min_2(): + content2 = ak.contents.NumpyArray( + np.array([True, True, True, False, True, False, False, True, False, True]) + ) + offsets3 = ak.index.Index64(np.array([0, 3, 6, 10], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + depth1 = ak.to_backend(depth1, "cuda", highlevel=False) + + assert to_list(depth1) == [ + [True, True, True], + [False, True, False], + [False, True, False, True], + ] + + assert to_list(ak.min(depth1, -1, highlevel=False)) == [True, False, False] + assert ( + ak.min(depth1.to_typetracer(), -1, highlevel=False).form + == ak.min(depth1, -1, highlevel=False).form + ) + assert to_list(ak.min(depth1, 1, 
highlevel=False)) == [True, False, False] + assert ( + ak.min(depth1.to_typetracer(), 1, highlevel=False).form + == ak.min(depth1, 1, highlevel=False).form + ) + + assert to_list(ak.min(depth1, -2, highlevel=False)) == [False, True, False, True] + assert ( + ak.min(depth1.to_typetracer(), -2, highlevel=False).form + == ak.min(depth1, -2, highlevel=False).form + ) + assert to_list(ak.min(depth1, 0, highlevel=False)) == [False, True, False, True] + assert ( + ak.min(depth1.to_typetracer(), 0, highlevel=False).form + == ak.min(depth1, 0, highlevel=False).form + ) + del depth1 + + +def test_0115_generic_reducer_operation_count_max_1(): + content2 = ak.contents.NumpyArray( + np.array([1.1, 2.2, 3.3, 0.0, 2.2, 0.0, 0.0, 2.2, 0.0, 4.4]) + ) + offsets3 = ak.index.Index64(np.array([0, 3, 6, 10], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + depth1 = ak.to_backend(depth1, "cuda", highlevel=False) + + assert to_list(depth1) == [ + [1.1, 2.2, 3.3], + [0.0, 2.2, 0.0], + [0.0, 2.2, 0.0, 4.4], + ] + + assert to_list(ak.max(depth1, -1, highlevel=False)) == [3.3, 2.2, 4.4] + assert ( + ak.max(depth1.to_typetracer(), -1, highlevel=False).form + == ak.max(depth1, -1, highlevel=False).form + ) + assert to_list(ak.max(depth1, 1, highlevel=False)) == [3.3, 2.2, 4.4] + assert ( + ak.max(depth1.to_typetracer(), 1, highlevel=False).form + == ak.max(depth1, 1, highlevel=False).form + ) + + assert to_list(ak.max(depth1, -2, highlevel=False)) == [1.1, 2.2, 3.3, 4.4] + assert ( + ak.max(depth1.to_typetracer(), -2, highlevel=False).form + == ak.max(depth1, -2, highlevel=False).form + ) + assert to_list(ak.max(depth1, 0, highlevel=False)) == [1.1, 2.2, 3.3, 4.4] + assert ( + ak.max(depth1.to_typetracer(), 0, highlevel=False).form + == ak.max(depth1, 0, highlevel=False).form + ) + del depth1 + + +def test_0115_generic_reducer_operation_count_max_2(): + content2 = ak.contents.NumpyArray( + np.array([False, True, True, False, True, False, False, False, False, False]) 
+ ) + offsets3 = ak.index.Index64(np.array([0, 3, 6, 10], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + depth1 = ak.to_backend(depth1, "cuda", highlevel=False) + + assert to_list(depth1) == [ + [False, True, True], + [False, True, False], + [False, False, False, False], + ] + + assert to_list(ak.max(depth1, -1, highlevel=False)) == [True, True, False] + assert ( + ak.max(depth1.to_typetracer(), -1, highlevel=False).form + == ak.max(depth1, -1, highlevel=False).form + ) + assert to_list(ak.max(depth1, 1, highlevel=False)) == [True, True, False] + assert ( + ak.max(depth1.to_typetracer(), 1, highlevel=False).form + == ak.max(depth1, 1, highlevel=False).form + ) + + assert to_list(ak.max(depth1, -2, highlevel=False)) == [False, True, True, False] + assert ( + ak.max(depth1.to_typetracer(), -2, highlevel=False).form + == ak.max(depth1, -2, highlevel=False).form + ) + assert to_list(ak.max(depth1, 0, highlevel=False)) == [False, True, True, False] + assert ( + ak.max(depth1.to_typetracer(), 0, highlevel=False).form + == ak.max(depth1, 0, highlevel=False).form + ) + del depth1 + + +def test_0115_generic_reducer_operation_mask(): + content = ak.contents.NumpyArray( + np.array([1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9]) + ) + offsets = ak.index.Index64(np.array([0, 3, 3, 5, 6, 6, 6, 9], dtype=np.int64)) + array = ak.contents.ListOffsetArray(offsets, content) + array = ak.to_backend(array, "cuda", highlevel=False) + + assert to_list(ak.min(array, axis=-1, mask_identity=False, highlevel=False)) == [ + 1.1, + np.inf, + 4.4, + 6.6, + np.inf, + np.inf, + 7.7, + ] + assert ( + ak.min( + array.to_typetracer(), axis=-1, mask_identity=False, highlevel=False + ).form + == ak.min(array, axis=-1, mask_identity=False, highlevel=False).form + ) + assert to_list(ak.min(array, axis=-1, mask_identity=True, highlevel=False)) == [ + 1.1, + None, + 4.4, + 6.6, + None, + None, + 7.7, + ] + assert ( + ak.min(array.to_typetracer(), axis=-1, mask_identity=True, 
highlevel=False).form + == ak.min(array, axis=-1, mask_identity=True, highlevel=False).form + ) + del array + + +def test_0115_generic_reducer_operation_ByteMaskedArray(): + content = ak.operations.from_iter( + [ + [[1.1, 0.0, 2.2], [], [3.3, 4.4]], + [], + [[5.5]], + [[6.6, 9.9, 8.8, 7.7]], + [[], [12.2, 11.1, 10.0]], + ], + highlevel=False, + ) + mask = ak.index.Index8(np.array([0, 0, 1, 1, 0], dtype=np.int8)) + v2_array = ak.contents.ByteMaskedArray(mask, content, valid_when=False) + v2_array = ak.to_backend(v2_array, "cuda", highlevel=False) + + assert to_list(v2_array) == [ + [[1.1, 0.0, 2.2], [], [3.3, 4.4]], + [], + None, + None, + [[], [12.2, 11.1, 10.0]], + ] + assert to_list(ak.argmin(v2_array, axis=-1, highlevel=False)) == [ + [1, None, 0], + [], + None, + None, + [None, 2], + ] + assert ( + ak.argmin(v2_array.to_typetracer(), axis=-1, highlevel=False).form + == ak.argmin(v2_array, axis=-1, highlevel=False).form + ) + del v2_array + + +def test_0115_generic_reducer_operation_keepdims(): + nparray = np.array(primes[: 2 * 3 * 5], dtype=np.int64).reshape(2, 3, 5) + content1 = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5], dtype=np.int64)) + offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 30], dtype=np.int64)) + offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) + depth2 = ak.contents.ListOffsetArray( + offsets2, ak.contents.ListOffsetArray(offsets1, content1) + ) + depth2 = ak.to_backend(depth2, "cuda", highlevel=False) + + assert to_list(depth2) == [ + [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]], + [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]], + ] + + assert to_list( + ak.prod(depth2, axis=-1, keepdims=False, highlevel=False) + ) == to_list(ak.prod(nparray, axis=-1, keepdims=False, highlevel=False)) + assert ( + ak.prod(depth2.to_typetracer(), axis=-1, keepdims=False, highlevel=False).form + == ak.prod(depth2, axis=-1, keepdims=False, highlevel=False).form + ) + assert to_list( + 
ak.prod(depth2, axis=-2, keepdims=False, highlevel=False) + ) == to_list(ak.prod(nparray, axis=-2, keepdims=False, highlevel=False)) + assert ( + ak.prod(depth2.to_typetracer(), axis=-2, keepdims=False, highlevel=False).form + == ak.prod(depth2, axis=-2, keepdims=False, highlevel=False).form + ) + assert to_list( + ak.prod(depth2, axis=-3, keepdims=False, highlevel=False) + ) == to_list(ak.prod(nparray, axis=-3, keepdims=False, highlevel=False)) + assert ( + ak.prod(depth2.to_typetracer(), axis=-3, keepdims=False, highlevel=False).form + == ak.prod(depth2, axis=-3, keepdims=False, highlevel=False).form + ) + + assert to_list(ak.prod(depth2, axis=-1, keepdims=True, highlevel=False)) == to_list( + ak.prod(nparray, axis=-1, keepdims=True, highlevel=False) + ) + assert ( + ak.prod(depth2.to_typetracer(), axis=-1, keepdims=True, highlevel=False).form + == ak.prod(depth2, axis=-1, keepdims=True, highlevel=False).form + ) + assert to_list(ak.prod(depth2, axis=-2, keepdims=True, highlevel=False)) == to_list( + ak.prod(nparray, axis=-2, keepdims=True, highlevel=False) + ) + assert ( + ak.prod(depth2.to_typetracer(), axis=-2, keepdims=True, highlevel=False).form + == ak.prod(depth2, axis=-2, keepdims=True, highlevel=False).form + ) + assert to_list(ak.prod(depth2, axis=-3, keepdims=True, highlevel=False)) == to_list( + ak.prod(nparray, axis=-3, keepdims=True, highlevel=False) + ) + assert ( + ak.prod(depth2.to_typetracer(), axis=-3, keepdims=True, highlevel=False).form + == ak.prod(depth2, axis=-3, keepdims=True, highlevel=False).form + ) + del depth2 + + +def test_0115_generic_reducer_operation_highlevel_1(): + array = ak.highlevel.Array( + [[[2, 3, 5], [], [7, 11], [13]], [], [[17, 19], [23]]], check_valid=True + ) + array = ak.to_backend(array, "cuda", highlevel=False) + + assert ak.operations.count(array) == 9 + assert to_list(ak.operations.count(array, axis=-1)) == [ + [3, 0, 2, 1], + [], + [2, 1], + ] + assert to_list(ak.operations.count(array, axis=2)) == [ + [3, 0, 
2, 1], + [], + [2, 1], + ] + assert to_list(ak.operations.count(array, axis=-1, keepdims=True)) == [ + [[3], [0], [2], [1]], + [], + [[2], [1]], + ] + assert to_list(ak.operations.count(array, axis=-2)) == [ + [3, 2, 1], + [], + [2, 1], + ] + assert to_list(ak.operations.count(array, axis=1)) == [ + [3, 2, 1], + [], + [2, 1], + ] + assert to_list(ak.operations.count(array, axis=-2, keepdims=True)) == [ + [[3, 2, 1]], + [[]], + [[2, 1]], + ] + + assert ak.operations.count_nonzero(array) == 9 + assert to_list(ak.operations.count_nonzero(array, axis=-1)) == [ + [3, 0, 2, 1], + [], + [2, 1], + ] + assert to_list(ak.operations.count_nonzero(array, axis=-2)) == [ + [3, 2, 1], + [], + [2, 1], + ] + + assert ak.operations.sum(array) == 2 + 3 + 5 + 7 + 11 + 13 + 17 + 19 + 23 + assert to_list(ak.operations.sum(array, axis=-1)) == [ + [2 + 3 + 5, 0, 7 + 11, 13], + [], + [17 + 19, 23], + ] + assert to_list(ak.operations.sum(array, axis=-2)) == [ + [2 + 7 + 13, 3 + 11, 5], + [], + [17 + 23, 19], + ] + + assert ak.operations.prod(array) == 2 * 3 * 5 * 7 * 11 * 13 * 17 * 19 * 23 + assert to_list(ak.operations.prod(array, axis=-1)) == [ + [2 * 3 * 5, 1, 7 * 11, 13], + [], + [17 * 19, 23], + ] + assert to_list(ak.operations.prod(array, axis=-2)) == [ + [2 * 7 * 13, 3 * 11, 5], + [], + [17 * 23, 19], + ] + + assert ak.operations.min(array) == 2 + assert to_list(ak.operations.min(array, axis=-1)) == [ + [2, None, 7, 13], + [], + [17, 23], + ] + assert to_list(ak.operations.min(array, axis=-2)) == [ + [2, 3, 5], + [], + [17, 19], + ] + + assert ak.operations.max(array) == 23 + assert to_list(ak.operations.max(array, axis=-1)) == [ + [5, None, 11, 13], + [], + [19, 23], + ] + assert to_list(ak.operations.max(array, axis=-2)) == [ + [13, 11, 5], + [], + [23, 19], + ] + del array + + +def test_0115_generic_reducer_operation_highlevel_2(): + array = ak.highlevel.Array( + [ + [[True, False, True], [], [False, False], [True]], + [], + [[False, True], [True]], + ], + check_valid=True, + ) + 
array = ak.to_backend(array, "cuda", highlevel=False) + + assert ak.operations.any(array) is np.bool_(True) + assert to_list(ak.operations.any(array, axis=-1)) == [ + [True, False, False, True], + [], + [True, True], + ] + assert to_list(ak.operations.any(array, axis=-2)) == [ + [True, False, True], + [], + [True, True], + ] + + assert ak.operations.all(array) is np.bool_(False) + assert to_list(ak.operations.all(array, axis=-1)) == [ + [False, True, False, True], + [], + [False, True], + ] + assert to_list(ak.operations.all(array, axis=-2)) == [ + [False, False, True], + [], + [False, True], + ] + del array From b59e428035a6dae95bc7e3c392c7ae284e0832eb Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Mon, 24 Jun 2024 12:02:50 +0200 Subject: [PATCH 31/33] fix: error for EmptyArray --- ...tOffsetArray_reduce_local_outoffsets_64.cu | 18 +++++++-------- .../cuda_kernels/awkward_reduce_argmax.cu | 22 +++++++++---------- .../cuda_kernels/awkward_reduce_argmin.cu | 22 +++++++++---------- .../cuda_kernels/awkward_reduce_count_64.cu | 18 +++++++-------- .../awkward_reduce_countnonzero.cu | 18 +++++++-------- .../cuda/cuda_kernels/awkward_reduce_max.cu | 18 +++++++-------- .../cuda/cuda_kernels/awkward_reduce_min.cu | 18 +++++++-------- .../cuda/cuda_kernels/awkward_reduce_prod.cu | 18 +++++++-------- .../cuda_kernels/awkward_reduce_prod_bool.cu | 18 +++++++-------- .../cuda/cuda_kernels/awkward_reduce_sum.cu | 18 +++++++-------- .../cuda_kernels/awkward_reduce_sum_bool.cu | 18 +++++++-------- .../awkward_reduce_sum_int32_bool_64.cu | 18 +++++++-------- .../awkward_reduce_sum_int64_bool_64.cu | 18 +++++++-------- 13 files changed, 121 insertions(+), 121 deletions(-) diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu index 42e8119d46..62846dd0a7 100644 --- 
a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu @@ -59,17 +59,17 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_b( } __syncthreads(); - for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { - int64_t val = 0; - if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - val = temp[thread_id - stride]; + if (thread_id < lenparents) { + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + int64_t val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[thread_id - stride]; + } + __syncthreads(); + temp[thread_id] += val; + __syncthreads(); } - __syncthreads(); - temp[thread_id] += val; - __syncthreads(); - } - if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { atomicAdd(&scan_in_array[parent], temp[thread_id]); diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu index df515f05a4..71754d3588 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu @@ -59,19 +59,19 @@ awkward_reduce_argmax_b( } __syncthreads(); - for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { - int64_t index = -1; - if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - index = temp[thread_id - stride]; - } - if (index != -1 && (temp[thread_id] == -1 || fromptr[index] > fromptr[temp[thread_id]] || - (fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) { - temp[thread_id] = index; + if (thread_id < lenparents) { + for (int64_t stride = 1; stride < 
blockDim.x; stride *= 2) { + int64_t index = -1; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + index = temp[thread_id - stride]; + } + if (index != -1 && (temp[thread_id] == -1 || fromptr[index] > fromptr[temp[thread_id]] || + (fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) { + temp[thread_id] = index; + } + __syncthreads(); } - __syncthreads(); - } - if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { atomicExch(&atomic_toptr[parent], temp[thread_id]); diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu index af1d3fd93d..b8517098e8 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu @@ -59,19 +59,19 @@ awkward_reduce_argmin_b( } __syncthreads(); - for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { - int64_t index = -1; - if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - index = temp[thread_id - stride]; - } - if (index != -1 && (temp[thread_id] == -1 || fromptr[index] < fromptr[temp[thread_id]] || - (fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) { - temp[thread_id] = index; + if (thread_id < lenparents) { + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + int64_t index = -1; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + index = temp[thread_id - stride]; + } + if (index != -1 && (temp[thread_id] == -1 || fromptr[index] < fromptr[temp[thread_id]] || + (fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) { + temp[thread_id] = index; + } + __syncthreads(); } - __syncthreads(); - } - if (thread_id < 
lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { atomicExch(&atomic_toptr[parent], temp[thread_id]); diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu index 9c55e69600..ebe8104be2 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu @@ -52,17 +52,17 @@ awkward_reduce_count_64_b( } __syncthreads(); - for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { - int64_t val = 0; - if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - val = temp[thread_id - stride]; + if (thread_id < lenparents) { + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + int64_t val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[thread_id - stride]; + } + __syncthreads(); + temp[thread_id] += val; + __syncthreads(); } - __syncthreads(); - temp[thread_id] += val; - __syncthreads(); - } - if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { atomicAdd(&toptr[parent], temp[thread_id]); diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu index ffcb0b8bd3..4dc9f50e5f 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu @@ -54,17 +54,17 @@ awkward_reduce_countnonzero_b( } __syncthreads(); - for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { - int64_t val = 0; - if (idx >= stride && thread_id < lenparents && parents[thread_id] == 
parents[thread_id - stride]) { - val = temp[thread_id - stride]; + if (thread_id < lenparents) { + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + int64_t val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[thread_id - stride]; + } + __syncthreads(); + temp[thread_id] += val; + __syncthreads(); } - __syncthreads(); - temp[thread_id] += val; - __syncthreads(); - } - if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { atomicAdd(&toptr[parent], temp[thread_id]); diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu index 6a3fe66055..2941aa417e 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu @@ -55,18 +55,18 @@ awkward_reduce_max_b( } __syncthreads(); - for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { - T val = identity; + if (thread_id < lenparents) { + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + T val = identity; - if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - val = temp[idx - stride]; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[idx - stride]; + } + __syncthreads(); + temp[thread_id] = val > temp[thread_id] ? val : temp[thread_id]; + __syncthreads(); } - __syncthreads(); - temp[thread_id] = val > temp[thread_id] ? 
val : temp[thread_id]; - __syncthreads(); - } - if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { atomicMax(&toptr[parent], temp[thread_id]); diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu index 12a72b338f..e709d687f8 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu @@ -56,17 +56,17 @@ awkward_reduce_min_b( } __syncthreads(); - for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { - T val = identity; - if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - val = temp[thread_id - stride]; + if (thread_id < lenparents) { + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + T val = identity; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[thread_id - stride]; + } + __syncthreads(); + temp[thread_id] = val < temp[thread_id] ? val : temp[thread_id]; + __syncthreads(); } - __syncthreads(); - temp[thread_id] = val < temp[thread_id] ? 
val : temp[thread_id]; - __syncthreads(); - } - if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { atomicMin(&toptr[parent], temp[thread_id]); diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu index 9248e20efc..e24e3ab56c 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu @@ -59,17 +59,17 @@ awkward_reduce_prod_b( } __syncthreads(); - for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { - T val = 1; - if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - val = temp[thread_id - stride]; + if (thread_id < lenparents) { + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + T val = 1; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[thread_id - stride]; + } + __syncthreads(); + temp[thread_id] *= val; + __syncthreads(); } - __syncthreads(); - temp[thread_id] *= val; - __syncthreads(); - } - if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { atomicMul(&atomic_toptr[parent], temp[thread_id]); diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu index 9d85b366c7..db09188f1f 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu @@ -59,17 +59,17 @@ awkward_reduce_prod_bool_b( } __syncthreads(); - for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { - T val = 1; - if (idx >= stride && thread_id < lenparents && 
parents[thread_id] == parents[thread_id - stride]) { - val = temp[thread_id - stride]; + if (thread_id < lenparents) { + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + T val = 1; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[thread_id - stride]; + } + __syncthreads(); + temp[thread_id] &= (val != 0); + __syncthreads(); } - __syncthreads(); - temp[thread_id] &= (val != 0); - __syncthreads(); - } - if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { atomicAnd(&atomic_toptr[parent], temp[thread_id]); diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu index 8ce2b8159c..66c320d87f 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu @@ -54,17 +54,17 @@ awkward_reduce_sum_b( } __syncthreads(); - for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { - T val = 0; - if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - val = temp[thread_id - stride]; + if (thread_id < lenparents) { + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + T val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[thread_id - stride]; + } + __syncthreads(); + temp[thread_id] += val; + __syncthreads(); } - __syncthreads(); - temp[thread_id] += val; - __syncthreads(); - } - if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { atomicAdd(&toptr[parent], temp[thread_id]); diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu 
b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu index f85df8e20a..168ee17cdb 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu @@ -59,17 +59,17 @@ awkward_reduce_sum_bool_b( } __syncthreads(); - for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { - T val = 0; - if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - val = temp[thread_id - stride]; + if (thread_id < lenparents) { + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + T val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[thread_id - stride]; + } + __syncthreads(); + temp[thread_id] |= (val != 0); + __syncthreads(); } - __syncthreads(); - temp[thread_id] |= (val != 0); - __syncthreads(); - } - if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { atomicOr(&atomic_toptr[parent], temp[thread_id]); diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu index f52b6fb21c..67da88ec04 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu @@ -54,17 +54,17 @@ awkward_reduce_sum_int32_bool_64_b( } __syncthreads(); - for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { - T val = 0; - if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - val = temp[thread_id - stride]; + if (thread_id < lenparents) { + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + T val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - 
stride]) { + val = temp[thread_id - stride]; + } + __syncthreads(); + temp[thread_id] += val; + __syncthreads(); } - __syncthreads(); - temp[thread_id] += val; - __syncthreads(); - } - if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { atomicAdd(&toptr[parent], temp[thread_id]); diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu index 7e220cccc0..2468760ac5 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu @@ -54,17 +54,17 @@ awkward_reduce_sum_int64_bool_64_b( } __syncthreads(); - for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { - T val = 0; - if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { - val = temp[thread_id - stride]; + if (thread_id < lenparents) { + for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { + T val = 0; + if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { + val = temp[thread_id - stride]; + } + __syncthreads(); + temp[thread_id] += val; + __syncthreads(); } - __syncthreads(); - temp[thread_id] += val; - __syncthreads(); - } - if (thread_id < lenparents) { int64_t parent = parents[thread_id]; if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) { atomicAdd(&toptr[parent], temp[thread_id]); From 1cf8e0d9e12fdd742c978be828716b8f6b1d2b26 Mon Sep 17 00:00:00 2001 From: ManasviGoyal Date: Mon, 24 Jun 2024 12:14:10 +0200 Subject: [PATCH 32/33] test: generic_reducer_operation and block_boundary --- tests-cuda/test_3162_block_boundary.py | 170 +++ ...est_3162_cuda_generic_reducer_operation.py | 1203 +++-------------- 2 files changed, 
357 insertions(+), 1016 deletions(-) create mode 100644 tests-cuda/test_3162_block_boundary.py diff --git a/tests-cuda/test_3162_block_boundary.py b/tests-cuda/test_3162_block_boundary.py new file mode 100644 index 0000000000..594265c424 --- /dev/null +++ b/tests-cuda/test_3162_block_boundary.py @@ -0,0 +1,170 @@ +from __future__ import annotations + +import cupy as cp +import numpy as np +import pytest + +import awkward as ak + +to_list = ak.operations.to_list + + +@pytest.fixture(scope="function", autouse=True) +def cleanup_cuda(): + yield + cp._default_memory_pool.free_all_blocks() + cp.cuda.Device().synchronize() + + +def test_block_boundary_sum(): + np.random.seed(42) + content = ak.contents.NumpyArray(np.random.randint(3000, size=3000)) + cuda_content = ak.to_backend(content, "cuda", highlevel=False) + assert ak.sum(cuda_content, -1, highlevel=False) == ak.sum( + content, -1, highlevel=False + ) + + offsets = ak.index.Index64(np.array([0, 1, 2998, 3000], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets, content) + cuda_depth1 = ak.to_backend(depth1, "cuda", highlevel=False) + assert to_list(ak.sum(cuda_depth1, -1, highlevel=False)) == to_list( + ak.sum(depth1, -1, highlevel=False) + ) + del cuda_content, cuda_depth1 + + +def test_block_boundary_any(): + np.random.seed(42) + content = ak.contents.NumpyArray(np.random.randint(3000, size=3000)) + cuda_content = ak.to_backend(content, "cuda", highlevel=False) + assert ak.any(cuda_content, -1, highlevel=False) == ak.any( + content, -1, highlevel=False + ) + + offsets = ak.index.Index64(np.array([0, 1, 2998, 3000], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets, content) + cuda_depth1 = ak.to_backend(depth1, "cuda", highlevel=False) + assert to_list(ak.any(cuda_depth1, -1, highlevel=False)) == to_list( + ak.any(depth1, -1, highlevel=False) + ) + del cuda_content, cuda_depth1 + + +def test_block_boundary_all(): + np.random.seed(42) + content = 
ak.contents.NumpyArray(np.random.randint(3000, size=3000)) + cuda_content = ak.to_backend(content, "cuda", highlevel=False) + assert ak.all(cuda_content, -1, highlevel=False) == ak.all( + content, -1, highlevel=False + ) + + offsets = ak.index.Index64(np.array([0, 1, 2998, 3000], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets, content) + cuda_depth1 = ak.to_backend(depth1, "cuda", highlevel=False) + assert to_list(ak.all(cuda_depth1, -1, highlevel=False)) == to_list( + ak.all(depth1, -1, highlevel=False) + ) + del cuda_content, cuda_depth1 + + +def test_block_boundary_sum_bool(): + np.random.seed(42) + content = ak.contents.NumpyArray(np.random.randint(2, size=3000)) + cuda_content = ak.to_backend(content, "cuda", highlevel=False) + assert ak.sum(cuda_content, -1, highlevel=False) == ak.sum( + content, -1, highlevel=False + ) + + offsets = ak.index.Index64(np.array([0, 1, 2998, 3000], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets, content) + cuda_depth1 = ak.to_backend(depth1, "cuda", highlevel=False) + assert to_list(ak.sum(cuda_depth1, -1, highlevel=False)) == to_list( + ak.sum(depth1, -1, highlevel=False) + ) + del cuda_content, cuda_depth1 + + +def test_block_boundary_max(): + np.random.seed(42) + content = ak.contents.NumpyArray(np.random.randint(3000, size=3000)) + cuda_content = ak.to_backend(content, "cuda", highlevel=False) + assert ak.max(cuda_content, -1, highlevel=False) == ak.max( + content, -1, highlevel=False + ) + + offsets = ak.index.Index64(np.array([0, 1, 2998, 3000], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets, content) + cuda_depth1 = ak.to_backend(depth1, "cuda", highlevel=False) + assert to_list(ak.max(cuda_depth1, -1, highlevel=False)) == to_list( + ak.max(depth1, -1, highlevel=False) + ) + del cuda_content, cuda_depth1 + + +def test_block_boundary_min(): + np.random.seed(42) + content = ak.contents.NumpyArray(np.random.randint(3000, size=3000)) + cuda_content = ak.to_backend(content, 
"cuda", highlevel=False) + assert ak.min(cuda_content, -1, highlevel=False) == ak.min( + content, -1, highlevel=False + ) + + offsets = ak.index.Index64(np.array([0, 1, 2998, 3000], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets, content) + cuda_depth1 = ak.to_backend(depth1, "cuda", highlevel=False) + assert to_list(ak.min(cuda_depth1, -1, highlevel=False)) == to_list( + ak.min(depth1, -1, highlevel=False) + ) + del cuda_content, cuda_depth1 + + +def test_block_boundary_count(): + np.random.seed(42) + content = ak.contents.NumpyArray(np.random.randint(3000, size=3000)) + cuda_content = ak.to_backend(content, "cuda", highlevel=False) + assert ak.count(cuda_content, -1, highlevel=False) == ak.count( + content, -1, highlevel=False + ) + + offsets = ak.index.Index64(np.array([0, 1, 2998, 3000], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets, content) + cuda_depth1 = ak.to_backend(depth1, "cuda", highlevel=False) + assert to_list(ak.count(cuda_depth1, -1, highlevel=False)) == to_list( + ak.count(depth1, -1, highlevel=False) + ) + del cuda_content, cuda_depth1 + + +def test_block_boundary_count_nonzero(): + np.random.seed(42) + content = ak.contents.NumpyArray(np.random.randint(2, size=3000)) + cuda_content = ak.to_backend(content, "cuda", highlevel=False) + assert ak.count_nonzero(cuda_content, -1, highlevel=False) == ak.count_nonzero( + content, -1, highlevel=False + ) + + offsets = ak.index.Index64(np.array([0, 1, 2998, 3000], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets, content) + cuda_depth1 = ak.to_backend(depth1, "cuda", highlevel=False) + assert to_list(ak.count_nonzero(cuda_depth1, -1, highlevel=False)) == to_list( + ak.count_nonzero(depth1, -1, highlevel=False) + ) + del cuda_content, cuda_depth1 + + +def test_block_boundary_prod(): + np.random.seed(42) + primes = [x for x in range(2, 30000) if all(x % n != 0 for n in range(2, x))] + content = ak.contents.NumpyArray(primes) + cuda_content = 
ak.to_backend(content, "cuda", highlevel=False) + assert ak.prod(cuda_content, -1, highlevel=False) == ak.prod( + content, -1, highlevel=False + ) + + offsets = ak.index.Index64(np.array([0, 1, 2998, 3000], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets, content) + cuda_depth1 = ak.to_backend(depth1, "cuda", highlevel=False) + assert to_list(ak.prod(cuda_depth1, -1, highlevel=False)) == to_list( + ak.prod(depth1, -1, highlevel=False) + ) + del cuda_content, cuda_depth1 diff --git a/tests-cuda/test_3162_cuda_generic_reducer_operation.py b/tests-cuda/test_3162_cuda_generic_reducer_operation.py index cd843a1f56..0c00106cb4 100644 --- a/tests-cuda/test_3162_cuda_generic_reducer_operation.py +++ b/tests-cuda/test_3162_cuda_generic_reducer_operation.py @@ -1,6 +1,7 @@ from __future__ import annotations import cupy as cp +import cupy.testing as cpt import numpy as np import pytest @@ -16,8 +17,6 @@ def cleanup_cuda(): cp.cuda.Device().synchronize() -to_list = ak.operations.to_list - primes = [x for x in range(2, 1000) if all(x % n != 0 for n in range(2, x))] @@ -80,710 +79,73 @@ def test_0115_generic_reducer_operation_dimension_optiontype_2(): [73 * 79 * 83 * 89 * 97], [53 * 59 * 61 * 67 * 71], ], - [[31 * 37 * 41 * 43 * 47], [13 * 17 * 19 * 23 * 29], [2 * 3 * 5 * 7 * 11]], - ] - del depth2 - - -def test_0115_generic_reducer_operation_reproduce_numpy_1(): - content1 = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5], dtype=np.int64)) - offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 30], dtype=np.int64)) - offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) - depth2 = ak.contents.ListOffsetArray( - offsets2, ak.contents.ListOffsetArray(offsets1, content1) - ) - depth2 = ak.to_backend(depth2, "cuda", highlevel=False) - - assert to_list(depth2) == [ - [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]], - [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]], - ] - - assert to_list(ak.prod(depth2, 
axis=-1, highlevel=False)) == [ - [2 * 3 * 5 * 7 * 11, 13 * 17 * 19 * 23 * 29, 31 * 37 * 41 * 43 * 47], - [53 * 59 * 61 * 67 * 71, 73 * 79 * 83 * 89 * 97, 101 * 103 * 107 * 109 * 113], - ] - assert ( - ak.prod(depth2.to_typetracer(), axis=-1, highlevel=False).form - == ak.prod(depth2, axis=-1, highlevel=False).form - ) - assert to_list(ak.prod(depth2, axis=2, highlevel=False)) == [ - [2 * 3 * 5 * 7 * 11, 13 * 17 * 19 * 23 * 29, 31 * 37 * 41 * 43 * 47], - [53 * 59 * 61 * 67 * 71, 73 * 79 * 83 * 89 * 97, 101 * 103 * 107 * 109 * 113], - ] - assert ( - ak.prod(depth2.to_typetracer(), axis=2, highlevel=False).form - == ak.prod(depth2, axis=2, highlevel=False).form - ) - - assert to_list(ak.prod(depth2, axis=-2, highlevel=False)) == [ - [2 * 13 * 31, 3 * 17 * 37, 5 * 19 * 41, 7 * 23 * 43, 11 * 29 * 47], - [53 * 73 * 101, 59 * 79 * 103, 61 * 83 * 107, 67 * 89 * 109, 71 * 97 * 113], - ] - assert ( - ak.prod(depth2.to_typetracer(), axis=-2, highlevel=False).form - == ak.prod(depth2, axis=-2, highlevel=False).form - ) - assert to_list(ak.prod(depth2, axis=1, highlevel=False)) == [ - [2 * 13 * 31, 3 * 17 * 37, 5 * 19 * 41, 7 * 23 * 43, 11 * 29 * 47], - [53 * 73 * 101, 59 * 79 * 103, 61 * 83 * 107, 67 * 89 * 109, 71 * 97 * 113], - ] - assert ( - ak.prod(depth2.to_typetracer(), axis=1, highlevel=False).form - == ak.prod(depth2, axis=1, highlevel=False).form - ) - - assert to_list(ak.prod(depth2, axis=-3, highlevel=False)) == [ - [2 * 53, 3 * 59, 5 * 61, 7 * 67, 11 * 71], - [13 * 73, 17 * 79, 19 * 83, 23 * 89, 29 * 97], - [31 * 101, 37 * 103, 41 * 107, 43 * 109, 47 * 113], - ] - assert ( - ak.prod(depth2.to_typetracer(), axis=-3, highlevel=False).form - == ak.prod(depth2, axis=-3, highlevel=False).form - ) - assert to_list(ak.prod(depth2, axis=0, highlevel=False)) == [ - [2 * 53, 3 * 59, 5 * 61, 7 * 67, 11 * 71], - [13 * 73, 17 * 79, 19 * 83, 23 * 89, 29 * 97], - [31 * 101, 37 * 103, 41 * 107, 43 * 109, 47 * 113], - ] - assert ( - ak.prod(depth2.to_typetracer(), axis=0, 
highlevel=False).form - == ak.prod(depth2, axis=0, highlevel=False).form - ) - del depth2 - - -def test_0115_generic_reducer_operation_reproduce_numpy_2(): - content2 = ak.contents.NumpyArray(np.array(primes[:12], dtype=np.int64)) - offsets3 = ak.index.Index64(np.array([0, 4, 8, 12], dtype=np.int64)) - depth1 = ak.contents.ListOffsetArray(offsets3, content2) - depth1 = ak.to_backend(depth1, "cuda", highlevel=False) - - assert to_list(ak.prod(depth1, -1, highlevel=False)) == [ - 2 * 3 * 5 * 7, - 11 * 13 * 17 * 19, - 23 * 29 * 31 * 37, - ] - assert ( - ak.prod(depth1.to_typetracer(), -1, highlevel=False).form - == ak.prod(depth1, -1, highlevel=False).form - ) - assert to_list(ak.prod(depth1, 1, highlevel=False)) == [ - 2 * 3 * 5 * 7, - 11 * 13 * 17 * 19, - 23 * 29 * 31 * 37, - ] - assert ( - ak.prod(depth1.to_typetracer(), 1, highlevel=False).form - == ak.prod(depth1, 1, highlevel=False).form - ) - - assert to_list(ak.prod(depth1, -2, highlevel=False)) == [ - 2 * 11 * 23, - 3 * 13 * 29, - 5 * 17 * 31, - 7 * 19 * 37, - ] - assert ( - ak.prod(depth1.to_typetracer(), -2, highlevel=False).form - == ak.prod(depth1, -2, highlevel=False).form - ) - assert to_list(ak.prod(depth1, 0, highlevel=False)) == [ - 2 * 11 * 23, - 3 * 13 * 29, - 5 * 17 * 31, - 7 * 19 * 37, - ] - assert ( - ak.prod(depth1.to_typetracer(), 0, highlevel=False).form - == ak.prod(depth1, 0, highlevel=False).form - ) - del depth1 - - -def test_0115_generic_reducer_operation_gaps_1(): - content1 = ak.contents.NumpyArray( - np.array([123] + primes[: 2 * 3 * 5], dtype=np.int64) - ) - offsets1 = ak.index.Index64(np.array([0, 1, 6, 11, 16, 21, 26, 31], dtype=np.int64)) - offsets2 = ak.index.Index64(np.array([1, 4, 7], dtype=np.int64)) - depth2 = ak.contents.ListOffsetArray( - offsets2, ak.contents.ListOffsetArray(offsets1, content1) - ) - depth2 = ak.to_backend(depth2, "cuda", highlevel=False) - - assert to_list(depth2) == [ - [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]], - [[53, 59, 61, 67, 
71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]], - ] - - assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ - [106, 177, 305, 469, 781], - [949, 1343, 1577, 2047, 2813], - [3131, 3811, 4387, 4687, 5311], - ] - assert ( - ak.prod(depth2.to_typetracer(), -3, highlevel=False).form - == ak.prod(depth2, -3, highlevel=False).form - ) - del depth2 - - -def test_0115_generic_reducer_operation_gaps_2(): - content1 = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5 - 1], dtype=np.int64)) - offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 29], dtype=np.int64)) - offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) - depth2 = ak.contents.ListOffsetArray( - offsets2, ak.contents.ListOffsetArray(offsets1, content1) - ) - depth2 = ak.to_backend(depth2, "cuda", highlevel=False) - - assert to_list(depth2) == [ - [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]], - [ - [53, 59, 61, 67, 71], - [73, 79, 83, 89, 97], - [ - 101, - 103, - 107, - 109, - ], - ], - ] - - assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ - [106, 177, 305, 469, 781], - [949, 1343, 1577, 2047, 2813], - [3131, 3811, 4387, 4687, 47], - ] - assert ( - ak.prod(depth2.to_typetracer(), -3, highlevel=False).form - == ak.prod(depth2, -3, highlevel=False).form - ) - del depth2 - - -def test_0115_generic_reducer_operation_gaps_3(): - content1 = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5 - 2], dtype=np.int64)) - offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 28], dtype=np.int64)) - offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) - depth2 = ak.contents.ListOffsetArray( - offsets2, ak.contents.ListOffsetArray(offsets1, content1) - ) - depth2 = ak.to_backend(depth2, "cuda", highlevel=False) - - assert to_list(depth2) == [ - [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]], - [ - [53, 59, 61, 67, 71], - [73, 79, 83, 89, 97], - [ - 101, - 103, - 107, - ], - ], - ] - - assert to_list(ak.prod(depth2, -3, 
highlevel=False)) == [ - [106, 177, 305, 469, 781], - [949, 1343, 1577, 2047, 2813], - [3131, 3811, 4387, 43, 47], - ] - assert ( - ak.prod(depth2.to_typetracer(), -3, highlevel=False).form - == ak.prod(depth2, -3, highlevel=False).form - ) - del depth2 - - -def test_0115_generic_reducer_operation_gaps_4(): - content1 = ak.contents.NumpyArray( - np.array( - [ - 2, - 3, - 5, - 7, - 11, - 13, - 17, - 19, - 23, - 29, - 31, - 37, - 41, - 43, - 47, - 53, - 59, - 61, - 67, - 71, - 73, - 79, - 83, - 89, - 101, - 103, - 107, - 109, - ], - dtype=np.int64, - ) - ) - offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 24, 28], dtype=np.int64)) - offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) - depth2 = ak.contents.ListOffsetArray( - offsets2, ak.contents.ListOffsetArray(offsets1, content1) - ) - depth2 = ak.to_backend(depth2, "cuda", highlevel=False) - - assert to_list(depth2) == [ - [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]], - [ - [53, 59, 61, 67, 71], - [ - 73, - 79, - 83, - 89, - ], - [101, 103, 107, 109], - ], - ] - - assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ - [106, 177, 305, 469, 781], - [949, 1343, 1577, 2047, 29], - [3131, 3811, 4387, 4687, 47], - ] - assert ( - ak.prod(depth2.to_typetracer(), -3, highlevel=False).form - == ak.prod(depth2, -3, highlevel=False).form - ) - del depth2 - - -def test_0115_generic_reducer_operation_gaps_5(): - content1 = ak.contents.NumpyArray(np.array(primes[1 : 2 * 3 * 5], dtype=np.int64)) - offsets1 = ak.index.Index64(np.array([0, 4, 9, 14, 19, 24, 29], dtype=np.int64)) - offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) - depth2 = ak.contents.ListOffsetArray( - offsets2, ak.contents.ListOffsetArray(offsets1, content1) - ) - depth2 = ak.to_backend(depth2, "cuda", highlevel=False) - - assert to_list(depth2) == [ - [[3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]], - [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]], - ] - - assert 
to_list(ak.prod(depth2, -3, highlevel=False)) == [ - [159, 295, 427, 737, 71], - [949, 1343, 1577, 2047, 2813], - [3131, 3811, 4387, 4687, 5311], - ] - assert ( - ak.prod(depth2.to_typetracer(), -3, highlevel=False).form - == ak.prod(depth2, -3, highlevel=False).form - ) - del depth2 - - -def test_0115_generic_reducer_operation_gaps_6(): - content1 = ak.contents.NumpyArray(np.array(primes[2 : 2 * 3 * 5], dtype=np.int64)) - offsets1 = ak.index.Index64(np.array([0, 3, 8, 13, 18, 23, 28], dtype=np.int64)) - offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) - depth2 = ak.contents.ListOffsetArray( - offsets2, ak.contents.ListOffsetArray(offsets1, content1) - ) - depth2 = ak.to_backend(depth2, "cuda", highlevel=False) - - assert to_list(depth2) == [ - [[5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]], - [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]], - ] - - assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ - [265, 413, 671, 67, 71], - [949, 1343, 1577, 2047, 2813], - [3131, 3811, 4387, 4687, 5311], - ] - assert ( - ak.prod(depth2.to_typetracer(), -3, highlevel=False).form - == ak.prod(depth2, -3, highlevel=False).form - ) - del depth2 - - -def test_0115_generic_reducer_operation_gaps_7(): - content1 = ak.contents.NumpyArray( - np.array( - [ - 3, - 5, - 7, - 13, - 17, - 19, - 23, - 29, - 31, - 37, - 41, - 43, - 47, - 53, - 59, - 61, - 67, - 71, - 73, - 79, - 83, - 89, - 97, - 101, - 103, - 107, - 109, - 113, - ], - dtype=np.int64, - ) - ) - offsets1 = ak.index.Index64(np.array([0, 3, 8, 13, 18, 23, 28], dtype=np.int64)) - offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) - depth2 = ak.contents.ListOffsetArray( - offsets2, ak.contents.ListOffsetArray(offsets1, content1) - ) - depth2 = ak.to_backend(depth2, "cuda", highlevel=False) - - assert to_list(depth2) == [ - [ - [ - 3, - 5, - 7, - ], - [13, 17, 19, 23, 29], - [31, 37, 41, 43, 47], - ], - [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 
109, 113]], - ] - - assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ - [159, 295, 427, 67, 71], - [949, 1343, 1577, 2047, 2813], - [3131, 3811, 4387, 4687, 5311], - ] - assert ( - ak.prod(depth2.to_typetracer(), -3, highlevel=False).form - == ak.prod(depth2, -3, highlevel=False).form - ) - del depth2 - - -def test_0115_generic_reducer_operation_gaps_8(): - content1 = ak.contents.NumpyArray( - np.array( - [ - 3, - 5, - 7, - 11, - 13, - 17, - 19, - 23, - 31, - 37, - 41, - 43, - 47, - 53, - 59, - 61, - 67, - 71, - 73, - 79, - 83, - 89, - 97, - 101, - 103, - 107, - 109, - 113, - ], - dtype=np.int64, - ) - ) - offsets1 = ak.index.Index64(np.array([0, 4, 8, 13, 18, 23, 28], dtype=np.int64)) - offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) - depth2 = ak.contents.ListOffsetArray( - offsets2, ak.contents.ListOffsetArray(offsets1, content1) - ) - depth2 = ak.to_backend(depth2, "cuda", highlevel=False) - - assert to_list(depth2) == [ - [[3, 5, 7, 11], [13, 17, 19, 23], [31, 37, 41, 43, 47]], - [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]], - ] - - assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ - [159, 295, 427, 737, 71], - [949, 1343, 1577, 2047, 97], - [3131, 3811, 4387, 4687, 5311], - ] - assert ( - ak.prod(depth2.to_typetracer(), -3, highlevel=False).form - == ak.prod(depth2, -3, highlevel=False).form - ) - del depth2 - - -def test_0115_generic_reducer_operation_gaps_9(): - content1 = ak.contents.NumpyArray( - np.array( - [ - 2, - 3, - 5, - 7, - 11, - 13, - 17, - 19, - 23, - 29, - 31, - 37, - 41, - 43, - 53, - 59, - 61, - 67, - 71, - 73, - 79, - 83, - 89, - 97, - 101, - 103, - 107, - 109, - ], - dtype=np.int64, - ) - ) - offsets1 = ak.index.Index64(np.array([0, 5, 10, 14, 19, 24, 28], dtype=np.int64)) - offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) - depth2 = ak.contents.ListOffsetArray( - offsets2, ak.contents.ListOffsetArray(offsets1, content1) - ) - depth2 = ak.to_backend(depth2, "cuda", 
highlevel=False) - - assert to_list(depth2) == [ - [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43]], - [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109]], - ] - - assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ - [106, 177, 305, 469, 781], - [949, 1343, 1577, 2047, 2813], - [3131, 3811, 4387, 4687], - ] - assert ( - ak.prod(depth2.to_typetracer(), -3, highlevel=False).form - == ak.prod(depth2, -3, highlevel=False).form - ) - del depth2 - - -def test_0115_generic_reducer_operation_gaps_10(): - content1 = ak.contents.NumpyArray( - np.array( - [ - 2, - 3, - 5, - 7, - 11, - 13, - 17, - 19, - 23, - 31, - 37, - 41, - 43, - 47, - 53, - 59, - 61, - 67, - 71, - 73, - 79, - 83, - 89, - 101, - 103, - 107, - 109, - 113, - ], - dtype=np.int64, - ) - ) - offsets1 = ak.index.Index64(np.array([0, 5, 9, 14, 19, 23, 28], dtype=np.int64)) - offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) - depth2 = ak.contents.ListOffsetArray( - offsets2, ak.contents.ListOffsetArray(offsets1, content1) - ) - depth2 = ak.to_backend(depth2, "cuda", highlevel=False) - - assert to_list(depth2) == [ - [[2, 3, 5, 7, 11], [13, 17, 19, 23], [31, 37, 41, 43, 47]], - [[53, 59, 61, 67, 71], [73, 79, 83, 89], [101, 103, 107, 109, 113]], - ] - - assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ - [106, 177, 305, 469, 781], - [949, 1343, 1577, 2047], - [3131, 3811, 4387, 4687, 5311], - ] - assert ( - ak.prod(depth2.to_typetracer(), -3, highlevel=False).form - == ak.prod(depth2, -3, highlevel=False).form - ) - del depth2 - - -def test_0115_generic_reducer_operation_gaps_11(): - content1 = ak.contents.NumpyArray(np.array(primes[:9], dtype=np.int64)) - offsets1 = ak.index.Index64(np.array([0, 3, 4, 6, 6, 7, 9], dtype=np.int64)) - offsets2 = ak.index.Index64(np.array([0, 2, 4, 6], dtype=np.int64)) - depth2 = ak.contents.ListOffsetArray( - offsets2, ak.contents.ListOffsetArray(offsets1, content1) - ) - depth2 = ak.to_backend(depth2, "cuda", highlevel=False) - 
- assert to_list(depth2) == [[[2, 3, 5], [7]], [[11, 13], []], [[17], [19, 23]]] - - assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ - [2 * 11 * 17, 3 * 13, 5], - [7 * 19, 23], - ] - assert ( - ak.prod(depth2.to_typetracer(), -3, highlevel=False).form - == ak.prod(depth2, -3, highlevel=False).form - ) - del depth2 - - -def test_0115_generic_reducer_operation_gaps_12(): - content1 = ak.contents.NumpyArray(np.array(primes[:9], dtype=np.int64)) - offsets1 = ak.index.Index64(np.array([0, 3, 4, 6, 7, 9], dtype=np.int64)) - offsets2 = ak.index.Index64(np.array([0, 2, 3, 5], dtype=np.int64)) - depth2 = ak.contents.ListOffsetArray( - offsets2, ak.contents.ListOffsetArray(offsets1, content1) - ) - depth2 = ak.to_backend(depth2, "cuda", highlevel=False) - - assert to_list(depth2) == [[[2, 3, 5], [7]], [[11, 13]], [[17], [19, 23]]] - - assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ - [2 * 11 * 17, 3 * 13, 5], - [7 * 19, 23], - ] - assert ( - ak.prod(depth2.to_typetracer(), -3, highlevel=False).form - == ak.prod(depth2, -3, highlevel=False).form - ) - del depth2 - - -def test_0115_generic_reducer_operation_gaps_13(): - content1 = ak.contents.NumpyArray(np.array(primes[:10], dtype=np.int64)) - offsets1 = ak.index.Index64(np.array([0, 3, 5, 6, 8, 9, 10], dtype=np.int64)) - offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) - depth2 = ak.contents.ListOffsetArray( - offsets2, ak.contents.ListOffsetArray(offsets1, content1) - ) - depth2 = ak.to_backend(depth2, "cuda", highlevel=False) - - assert to_list(depth2) == [[[2, 3, 5], [7, 11], [13]], [[17, 19], [23], [29]]] - - assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ - [34, 57, 5], - [161, 11], - [377], + [[31 * 37 * 41 * 43 * 47], [13 * 17 * 19 * 23 * 29], [2 * 3 * 5 * 7 * 11]], ] - assert ( - ak.prod(depth2.to_typetracer(), -3, highlevel=False).form - == ak.prod(depth2, -3, highlevel=False).form - ) del depth2 -def test_0115_generic_reducer_operation_gaps_14(): - content1 = 
ak.contents.NumpyArray(np.array(primes[:9], dtype=np.int64)) - offsets1 = ak.index.Index64(np.array([0, 3, 3, 5, 6, 8, 9], dtype=np.int64)) - offsets2 = ak.index.Index64(np.array([0, 4, 6], dtype=np.int64)) +def test_0115_generic_reducer_operation_reproduce_numpy_1(): + content1 = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5], dtype=np.int64)) + offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 30], dtype=np.int64)) + offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) depth2 = ak.contents.ListOffsetArray( offsets2, ak.contents.ListOffsetArray(offsets1, content1) ) depth2 = ak.to_backend(depth2, "cuda", highlevel=False) - assert to_list(depth2) == [[[2, 3, 5], [], [7, 11], [13]], [[17, 19], [23]]] + assert to_list(depth2) == [ + [[2, 3, 5, 7, 11], [13, 17, 19, 23, 29], [31, 37, 41, 43, 47]], + [[53, 59, 61, 67, 71], [73, 79, 83, 89, 97], [101, 103, 107, 109, 113]], + ] - assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ - [34, 57, 5], - [23], - [7, 11], - [13], + assert to_list(ak.prod(depth2, axis=-1, highlevel=False)) == [ + [2 * 3 * 5 * 7 * 11, 13 * 17 * 19 * 23 * 29, 31 * 37 * 41 * 43 * 47], + [53 * 59 * 61 * 67 * 71, 73 * 79 * 83 * 89 * 97, 101 * 103 * 107 * 109 * 113], ] assert ( - ak.prod(depth2.to_typetracer(), -3, highlevel=False).form - == ak.prod(depth2, -3, highlevel=False).form + ak.prod(depth2.to_typetracer(), axis=-1, highlevel=False).form + == ak.prod(depth2, axis=-1, highlevel=False).form + ) + assert to_list(ak.prod(depth2, axis=2, highlevel=False)) == [ + [2 * 3 * 5 * 7 * 11, 13 * 17 * 19 * 23 * 29, 31 * 37 * 41 * 43 * 47], + [53 * 59 * 61 * 67 * 71, 73 * 79 * 83 * 89 * 97, 101 * 103 * 107 * 109 * 113], + ] + assert ( + ak.prod(depth2.to_typetracer(), axis=2, highlevel=False).form + == ak.prod(depth2, axis=2, highlevel=False).form ) del depth2 -def test_0115_generic_reducer_operation_gaps_15(): - content1 = ak.contents.NumpyArray(np.array(primes[:9], dtype=np.int64)) - offsets1 = ak.index.Index64(np.array([0, 
3, 3, 5, 6, 8, 9], dtype=np.int64)) - offsets2 = ak.index.Index64(np.array([0, 4, 4, 6], dtype=np.int64)) - depth2 = ak.contents.ListOffsetArray( - offsets2, ak.contents.ListOffsetArray(offsets1, content1) - ) - depth2 = ak.to_backend(depth2, "cuda", highlevel=False) - - assert to_list(depth2) == [[[2, 3, 5], [], [7, 11], [13]], [], [[17, 19], [23]]] +def test_0115_generic_reducer_operation_reproduce_numpy_2(): + content2 = ak.contents.NumpyArray(np.array(primes[:12], dtype=np.int64)) + offsets3 = ak.index.Index64(np.array([0, 4, 8, 12], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + depth1 = ak.to_backend(depth1, "cuda", highlevel=False) - assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ - [34, 57, 5], - [23], - [7, 11], - [13], + assert to_list(ak.prod(depth1, -1, highlevel=False)) == [ + 2 * 3 * 5 * 7, + 11 * 13 * 17 * 19, + 23 * 29 * 31 * 37, ] assert ( - ak.prod(depth2.to_typetracer(), -3, highlevel=False).form - == ak.prod(depth2, -3, highlevel=False).form + ak.prod(depth1.to_typetracer(), -1, highlevel=False).form + == ak.prod(depth1, -1, highlevel=False).form ) - del depth2 + assert to_list(ak.prod(depth1, 1, highlevel=False)) == [ + 2 * 3 * 5 * 7, + 11 * 13 * 17 * 19, + 23 * 29 * 31 * 37, + ] + assert ( + ak.prod(depth1.to_typetracer(), 1, highlevel=False).form + == ak.prod(depth1, 1, highlevel=False).form + ) + + del depth1 -def test_0115_generic_reducer_operation_gaps_16(): +def test_0115_generic_reducer_operation_gaps_1(): content1 = ak.contents.NumpyArray(np.array(primes[: 2 * 3 * 5], dtype=np.int64)) offsets1 = ak.index.Index64(np.array([0, 5, 10, 15, 20, 25, 30], dtype=np.int64)) offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) @@ -806,18 +168,10 @@ def test_0115_generic_reducer_operation_gaps_16(): == ak.prod(depth2, -1, highlevel=False).form ) - assert to_list(ak.prod(depth2, -2, highlevel=False)) == [ - [2 * 13 * 31, 3 * 17 * 37, 5 * 19 * 41, 7 * 23 * 43, 11 * 29 * 47], - [53 * 73 * 101, 59 * 
79 * 103, 61 * 83 * 107, 67 * 89 * 109, 71 * 97 * 113], - ] - assert ( - ak.prod(depth2.to_typetracer(), -2, highlevel=False).form - == ak.prod(depth2, -2, highlevel=False).form - ) del depth2 -def test_0115_generic_reducer_operation_gaps_17(): +def test_0115_generic_reducer_operation_gaps_2(): content1 = ak.contents.NumpyArray(np.array(primes[:9], dtype=np.int64)) offsets1 = ak.index.Index64(np.array([0, 3, 3, 5, 6, 8, 9], dtype=np.int64)) offsets2 = ak.index.Index64(np.array([0, 4, 4, 6], dtype=np.int64)) @@ -842,26 +196,6 @@ def test_0115_generic_reducer_operation_gaps_17(): == ak.prod(depth2, -1, highlevel=False).form ) - assert to_list(ak.prod(depth2, -2, highlevel=False)) == [ - [2 * 7 * 13, 3 * 11, 5], - [], - [17 * 23, 19], - ] - assert ( - ak.prod(depth2.to_typetracer(), -2, highlevel=False).form - == ak.prod(depth2, -2, highlevel=False).form - ) - - assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ - [2 * 17, 3 * 19, 5], - [23], - [7, 11], - [13], - ] - assert ( - ak.prod(depth2.to_typetracer(), -3, highlevel=False).form - == ak.prod(depth2, -3, highlevel=False).form - ) del depth2 @@ -927,38 +261,6 @@ def test_0115_generic_reducer_operation_complicated(): == ak.prod(complicated["y"], -1, highlevel=False).form ) - with pytest.raises(TypeError): - to_list(ak.prod(complicated, -2, highlevel=False)) - - with pytest.raises(TypeError): - assert ( - ak.prod(complicated.to_typetracer(), -2, highlevel=False).form - == ak.prod(complicated, -2, highlevel=False).form - ) - assert to_list(ak.prod(complicated["x"], -2, highlevel=False)) == [ - [2, 3, 5], - [], - [7, 11], - ] - assert ( - ak.prod(complicated.to_typetracer()["x"], -2, highlevel=False).form - == ak.prod(complicated["x"], -2, highlevel=False).form - ) - assert to_list(ak.prod(complicated["y"], -2, highlevel=False)) == [ - [[182, 33, 5]], - [], - [[], [391, 19]], - ] - assert ( - ak.prod(complicated.to_typetracer()["y"], -2, highlevel=False).form - == ak.prod(complicated["y"], -2, 
highlevel=False).form - ) - - assert to_list(complicated[0]) == [ - {"x": [2, 3, 5], "y": [[2, 3, 5], [], [7, 11], [13]]} - ] - assert complicated.to_typetracer()[0].form == complicated[0].form - with pytest.raises(TypeError): to_list(ak.prod(complicated[0], -1, highlevel=False)) @@ -970,8 +272,7 @@ def test_0115_generic_reducer_operation_complicated(): def test_0115_generic_reducer_operation_EmptyArray(): offsets = ak.index.Index64(np.array([0, 0, 0, 0], dtype=np.int64)) array = ak.contents.ListOffsetArray(offsets, ak.contents.EmptyArray()) - array = ak.to_backend(array, "cuda") - + array = ak.to_backend(array, "cuda", highlevel=False) assert to_list(array) == [[], [], []] assert to_list(ak.prod(array, -1, highlevel=False)) == [1, 1, 1] @@ -984,7 +285,7 @@ def test_0115_generic_reducer_operation_EmptyArray(): array = ak.contents.ListOffsetArray( offsets, ak.contents.NumpyArray(np.array([], dtype=np.int64)) ) - array = ak.to_backend(array, "cuda") + array = ak.to_backend(array, "cuda", highlevel=False) assert to_list(array) == [[], [], []] @@ -1004,7 +305,7 @@ def test_0115_generic_reducer_operation_IndexedOptionArray_1(): indexedarray = ak.contents.IndexedArray(index, listoffsetarray) offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) depth2 = ak.contents.ListOffsetArray(offsets2, indexedarray) - depth2 = ak.to_backend(depth2, "cuda") + depth2 = ak.to_backend(depth2, "cuda", highlevel=False) assert to_list(depth2) == [ [[101, 103, 107, 109, 113], [73, 79, 83, 89, 97], [53, 59, 61, 67, 71]], @@ -1020,24 +321,6 @@ def test_0115_generic_reducer_operation_IndexedOptionArray_1(): == ak.prod(depth2, -1, highlevel=False).form ) - assert to_list(ak.prod(depth2, -2, highlevel=False)) == [ - [101 * 73 * 53, 103 * 79 * 59, 107 * 83 * 61, 109 * 89 * 67, 113 * 97 * 71], - [31 * 13 * 2, 37 * 17 * 3, 41 * 19 * 5, 43 * 23 * 7, 47 * 29 * 11], - ] - assert ( - ak.prod(depth2.to_typetracer(), -2, highlevel=False).form - == ak.prod(depth2, -2, highlevel=False).form - ) 
- - assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ - [101 * 31, 103 * 37, 107 * 41, 109 * 43, 113 * 47], - [73 * 13, 79 * 17, 83 * 19, 89 * 23, 97 * 29], - [53 * 2, 59 * 3, 61 * 5, 67 * 7, 71 * 11], - ] - assert ( - ak.prod(depth2.to_typetracer(), -3, highlevel=False).form - == ak.prod(depth2, -3, highlevel=False).form - ) del depth2 @@ -1075,7 +358,7 @@ def test_0115_generic_reducer_operation_IndexedOptionArray_2(): indexedoptionarray = ak.contents.IndexedOptionArray(index, listoffsetarray) offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) depth2 = ak.contents.ListOffsetArray(offsets2, indexedoptionarray) - depth2 = ak.to_backend(depth2, "cuda") + depth2 = ak.to_backend(depth2, "cuda", highlevel=False) assert to_list(depth2) == [ [[101, 103, 107, 109, 113], None, [53, 59, 61, 67, 71]], @@ -1091,24 +374,6 @@ def test_0115_generic_reducer_operation_IndexedOptionArray_2(): == ak.prod(depth2, -1, highlevel=False).form ) - assert to_list(ak.prod(depth2, -2, highlevel=False)) == [ - [101 * 53, 103 * 59, 107 * 61, 109 * 67, 113 * 71], - [31 * 2, 37 * 3, 41 * 5, 43 * 7, 47 * 11], - ] - assert ( - ak.prod(depth2.to_typetracer(), -2, highlevel=False).form - == ak.prod(depth2, -2, highlevel=False).form - ) - - assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ - [101 * 31, 103 * 37, 107 * 41, 109 * 43, 113 * 47], - [], - [53 * 2, 59 * 3, 61 * 5, 67 * 7, 71 * 11], - ] - assert ( - ak.prod(depth2.to_typetracer(), -3, highlevel=False).form - == ak.prod(depth2, -3, highlevel=False).form - ) del depth2 @@ -1182,7 +447,7 @@ def test_0115_generic_reducer_operation_IndexedOptionArray_3(): listoffsetarray = ak.contents.ListOffsetArray(offsets1, indexedoptionarray) offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) depth2 = ak.contents.ListOffsetArray(offsets2, listoffsetarray) - depth2 = ak.to_backend(depth2, "cuda") + depth2 = ak.to_backend(depth2, "cuda", highlevel=False) assert to_list(depth2) == [ [ @@ -1202,24 +467,6 @@ def 
test_0115_generic_reducer_operation_IndexedOptionArray_3(): == ak.prod(depth2, -1, highlevel=False).form ) - assert to_list(ak.prod(depth2, -2, highlevel=False)) == [ - [101 * 53, 103 * 59, 107 * 61, 109 * 67, 113 * 71], - [31 * 2, 37 * 3, 41 * 5, 43 * 7, 47 * 11], - ] - assert ( - ak.prod(depth2.to_typetracer(), -2, highlevel=False).form - == ak.prod(depth2, -2, highlevel=False).form - ) - - assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ - [101 * 31, 103 * 37, 107 * 41, 109 * 43, 113 * 47], - [1, 1, 1, 1, 1], - [53 * 2, 59 * 3, 61 * 5, 67 * 7, 71 * 11], - ] - assert ( - ak.prod(depth2.to_typetracer(), -3, highlevel=False).form - == ak.prod(depth2, -3, highlevel=False).form - ) del depth2 @@ -1285,7 +532,7 @@ def test_0115_generic_reducer_operation_IndexedOptionArray_4(): listoffsetarray = ak.contents.ListOffsetArray(offsets1, indexedoptionarray) offsets2 = ak.index.Index64(np.array([0, 3, 6], dtype=np.int64)) depth2 = ak.contents.ListOffsetArray(offsets2, listoffsetarray) - depth2 = ak.to_backend(depth2, "cuda") + depth2 = ak.to_backend(depth2, "cuda", highlevel=False) assert to_list(depth2) == [ [[101, 103, 107, 109, 113], [None], [53, 59, 61, 67, 71]], @@ -1301,24 +548,6 @@ def test_0115_generic_reducer_operation_IndexedOptionArray_4(): == ak.prod(depth2, -1, highlevel=False).form ) - assert to_list(ak.prod(depth2, -2, highlevel=False)) == [ - [101 * 53, 103 * 59, 107 * 61, 109 * 67, 113 * 71], - [31 * 2, 37 * 3, 41 * 5, 43 * 7, 47 * 11], - ] - assert ( - ak.prod(depth2.to_typetracer(), -2, highlevel=False).form - == ak.prod(depth2, -2, highlevel=False).form - ) - - assert to_list(ak.prod(depth2, -3, highlevel=False)) == [ - [101 * 31, 103 * 37, 107 * 41, 109 * 43, 113 * 47], - [1], - [53 * 2, 59 * 3, 61 * 5, 67 * 7, 71 * 11], - ] - assert ( - ak.prod(depth2.to_typetracer(), -3, highlevel=False).form - == ak.prod(depth2, -3, highlevel=False).form - ) del depth2 @@ -1349,26 +578,6 @@ def test_0115_generic_reducer_operation_sum(): == ak.sum(depth1, 1, 
highlevel=False).form ) - assert to_list(ak.sum(depth1, -2, highlevel=False)) == [ - 1 + 16 + 256, - 2 + 32 + 512, - 4 + 64 + 1024, - 8 + 128 + 2048, - ] - assert ( - ak.sum(depth1.to_typetracer(), -2, highlevel=False).form - == ak.sum(depth1, -2, highlevel=False).form - ) - assert to_list(ak.sum(depth1, 0, highlevel=False)) == [ - 1 + 16 + 256, - 2 + 32 + 512, - 4 + 64 + 1024, - 8 + 128 + 2048, - ] - assert ( - ak.sum(depth1.to_typetracer(), 0, highlevel=False).form - == ak.sum(depth1, 0, highlevel=False).form - ) del depth1 @@ -1397,16 +606,6 @@ def test_0115_generic_reducer_operation_any(): == ak.any(depth1, 1, highlevel=False).form ) - assert to_list(ak.any(depth1, -2, highlevel=False)) == [True, True, True, False] - assert ( - ak.any(depth1.to_typetracer(), -2, highlevel=False).form - == ak.any(depth1, -2, highlevel=False).form - ) - assert to_list(ak.any(depth1, 0, highlevel=False)) == [True, True, True, False] - assert ( - ak.any(depth1.to_typetracer(), 0, highlevel=False).form - == ak.any(depth1, 0, highlevel=False).form - ) del depth1 @@ -1435,16 +634,6 @@ def test_0115_generic_reducer_operation_all(): == ak.all(depth1, 1, highlevel=False).form ) - assert to_list(ak.all(depth1, -2, highlevel=False)) == [False, True, False, True] - assert ( - ak.all(depth1.to_typetracer(), -2, highlevel=False).form - == ak.all(depth1, -2, highlevel=False).form - ) - assert to_list(ak.all(depth1, 0, highlevel=False)) == [False, True, False, True] - assert ( - ak.all(depth1.to_typetracer(), 0, highlevel=False).form - == ak.all(depth1, 0, highlevel=False).form - ) del depth1 @@ -1473,16 +662,6 @@ def test_0115_generic_reducer_operation_count(): == ak.count(depth1, 1, highlevel=False).form ) - assert to_list(ak.count(depth1, -2, highlevel=False)) == [3, 3, 3, 1] - assert ( - ak.count(depth1.to_typetracer(), -2, highlevel=False).form - == ak.count(depth1, -2, highlevel=False).form - ) - assert to_list(ak.count(depth1, 0, highlevel=False)) == [3, 3, 3, 1] - assert ( - 
ak.count(depth1.to_typetracer(), 0, highlevel=False).form - == ak.count(depth1, 0, highlevel=False).form - ) del depth1 @@ -1511,16 +690,6 @@ def test_0115_generic_reducer_operation_count_nonzero(): == ak.count_nonzero(depth1, 1, highlevel=False).form ) - assert to_list(ak.count_nonzero(depth1, -2, highlevel=False)) == [1, 3, 1, 1] - assert ( - ak.count_nonzero(depth1.to_typetracer(), -2, highlevel=False).form - == ak.count_nonzero(depth1, -2, highlevel=False).form - ) - assert to_list(ak.count_nonzero(depth1, 0, highlevel=False)) == [1, 3, 1, 1] - assert ( - ak.count_nonzero(depth1.to_typetracer(), 0, highlevel=False).form - == ak.count_nonzero(depth1, 0, highlevel=False).form - ) del depth1 @@ -1549,16 +718,6 @@ def test_0115_generic_reducer_operation_count_min_1(): == ak.min(depth1, 1, highlevel=False).form ) - assert to_list(ak.min(depth1, -2, highlevel=False)) == [0.0, 2.2, 0.0, 4.4] - assert ( - ak.min(depth1.to_typetracer(), -2, highlevel=False).form - == ak.min(depth1, -2, highlevel=False).form - ) - assert to_list(ak.min(depth1, 0, highlevel=False)) == [0.0, 2.2, 0.0, 4.4] - assert ( - ak.min(depth1.to_typetracer(), 0, highlevel=False).form - == ak.min(depth1, 0, highlevel=False).form - ) del depth1 @@ -1587,16 +746,6 @@ def test_0115_generic_reducer_operation_count_min_2(): == ak.min(depth1, 1, highlevel=False).form ) - assert to_list(ak.min(depth1, -2, highlevel=False)) == [False, True, False, True] - assert ( - ak.min(depth1.to_typetracer(), -2, highlevel=False).form - == ak.min(depth1, -2, highlevel=False).form - ) - assert to_list(ak.min(depth1, 0, highlevel=False)) == [False, True, False, True] - assert ( - ak.min(depth1.to_typetracer(), 0, highlevel=False).form - == ak.min(depth1, 0, highlevel=False).form - ) del depth1 @@ -1625,16 +774,6 @@ def test_0115_generic_reducer_operation_count_max_1(): == ak.max(depth1, 1, highlevel=False).form ) - assert to_list(ak.max(depth1, -2, highlevel=False)) == [1.1, 2.2, 3.3, 4.4] - assert ( - 
ak.max(depth1.to_typetracer(), -2, highlevel=False).form - == ak.max(depth1, -2, highlevel=False).form - ) - assert to_list(ak.max(depth1, 0, highlevel=False)) == [1.1, 2.2, 3.3, 4.4] - assert ( - ak.max(depth1.to_typetracer(), 0, highlevel=False).form - == ak.max(depth1, 0, highlevel=False).form - ) del depth1 @@ -1663,16 +802,6 @@ def test_0115_generic_reducer_operation_count_max_2(): == ak.max(depth1, 1, highlevel=False).form ) - assert to_list(ak.max(depth1, -2, highlevel=False)) == [False, True, True, False] - assert ( - ak.max(depth1.to_typetracer(), -2, highlevel=False).form - == ak.max(depth1, -2, highlevel=False).form - ) - assert to_list(ak.max(depth1, 0, highlevel=False)) == [False, True, True, False] - assert ( - ak.max(depth1.to_typetracer(), 0, highlevel=False).form - == ak.max(depth1, 0, highlevel=False).form - ) del depth1 @@ -1773,20 +902,6 @@ def test_0115_generic_reducer_operation_keepdims(): ak.prod(depth2.to_typetracer(), axis=-1, keepdims=False, highlevel=False).form == ak.prod(depth2, axis=-1, keepdims=False, highlevel=False).form ) - assert to_list( - ak.prod(depth2, axis=-2, keepdims=False, highlevel=False) - ) == to_list(ak.prod(nparray, axis=-2, keepdims=False, highlevel=False)) - assert ( - ak.prod(depth2.to_typetracer(), axis=-2, keepdims=False, highlevel=False).form - == ak.prod(depth2, axis=-2, keepdims=False, highlevel=False).form - ) - assert to_list( - ak.prod(depth2, axis=-3, keepdims=False, highlevel=False) - ) == to_list(ak.prod(nparray, axis=-3, keepdims=False, highlevel=False)) - assert ( - ak.prod(depth2.to_typetracer(), axis=-3, keepdims=False, highlevel=False).form - == ak.prod(depth2, axis=-3, keepdims=False, highlevel=False).form - ) assert to_list(ak.prod(depth2, axis=-1, keepdims=True, highlevel=False)) == to_list( ak.prod(nparray, axis=-1, keepdims=True, highlevel=False) @@ -1795,20 +910,7 @@ def test_0115_generic_reducer_operation_keepdims(): ak.prod(depth2.to_typetracer(), axis=-1, keepdims=True, 
highlevel=False).form == ak.prod(depth2, axis=-1, keepdims=True, highlevel=False).form ) - assert to_list(ak.prod(depth2, axis=-2, keepdims=True, highlevel=False)) == to_list( - ak.prod(nparray, axis=-2, keepdims=True, highlevel=False) - ) - assert ( - ak.prod(depth2.to_typetracer(), axis=-2, keepdims=True, highlevel=False).form - == ak.prod(depth2, axis=-2, keepdims=True, highlevel=False).form - ) - assert to_list(ak.prod(depth2, axis=-3, keepdims=True, highlevel=False)) == to_list( - ak.prod(nparray, axis=-3, keepdims=True, highlevel=False) - ) - assert ( - ak.prod(depth2.to_typetracer(), axis=-3, keepdims=True, highlevel=False).form - == ak.prod(depth2, axis=-3, keepdims=True, highlevel=False).form - ) + del depth2 @@ -1824,31 +926,11 @@ def test_0115_generic_reducer_operation_highlevel_1(): [], [2, 1], ] - assert to_list(ak.operations.count(array, axis=2)) == [ - [3, 0, 2, 1], - [], - [2, 1], - ] assert to_list(ak.operations.count(array, axis=-1, keepdims=True)) == [ [[3], [0], [2], [1]], [], [[2], [1]], ] - assert to_list(ak.operations.count(array, axis=-2)) == [ - [3, 2, 1], - [], - [2, 1], - ] - assert to_list(ak.operations.count(array, axis=1)) == [ - [3, 2, 1], - [], - [2, 1], - ] - assert to_list(ak.operations.count(array, axis=-2, keepdims=True)) == [ - [[3, 2, 1]], - [[]], - [[2, 1]], - ] assert ak.operations.count_nonzero(array) == 9 assert to_list(ak.operations.count_nonzero(array, axis=-1)) == [ @@ -1856,11 +938,6 @@ def test_0115_generic_reducer_operation_highlevel_1(): [], [2, 1], ] - assert to_list(ak.operations.count_nonzero(array, axis=-2)) == [ - [3, 2, 1], - [], - [2, 1], - ] assert ak.operations.sum(array) == 2 + 3 + 5 + 7 + 11 + 13 + 17 + 19 + 23 assert to_list(ak.operations.sum(array, axis=-1)) == [ @@ -1868,11 +945,6 @@ def test_0115_generic_reducer_operation_highlevel_1(): [], [17 + 19, 23], ] - assert to_list(ak.operations.sum(array, axis=-2)) == [ - [2 + 7 + 13, 3 + 11, 5], - [], - [17 + 23, 19], - ] assert ak.operations.prod(array) == 
2 * 3 * 5 * 7 * 11 * 13 * 17 * 19 * 23 assert to_list(ak.operations.prod(array, axis=-1)) == [ @@ -1880,23 +952,6 @@ def test_0115_generic_reducer_operation_highlevel_1(): [], [17 * 19, 23], ] - assert to_list(ak.operations.prod(array, axis=-2)) == [ - [2 * 7 * 13, 3 * 11, 5], - [], - [17 * 23, 19], - ] - - assert ak.operations.min(array) == 2 - assert to_list(ak.operations.min(array, axis=-1)) == [ - [2, None, 7, 13], - [], - [17, 23], - ] - assert to_list(ak.operations.min(array, axis=-2)) == [ - [2, 3, 5], - [], - [17, 19], - ] assert ak.operations.max(array) == 23 assert to_list(ak.operations.max(array, axis=-1)) == [ @@ -1904,11 +959,7 @@ def test_0115_generic_reducer_operation_highlevel_1(): [], [19, 23], ] - assert to_list(ak.operations.max(array, axis=-2)) == [ - [13, 11, 5], - [], - [23, 19], - ] + del array @@ -1921,29 +972,149 @@ def test_0115_generic_reducer_operation_highlevel_2(): ], check_valid=True, ) - array = ak.to_backend(array, "cuda", highlevel=False) - - assert ak.operations.any(array) is np.bool_(True) + array = ak.to_backend(array, "cuda") + assert ak.operations.any(array) == cp.bool_(True) assert to_list(ak.operations.any(array, axis=-1)) == [ [True, False, False, True], [], [True, True], ] - assert to_list(ak.operations.any(array, axis=-2)) == [ - [True, False, True], - [], - [True, True], - ] - assert ak.operations.all(array) is np.bool_(False) + assert ak.operations.all(array) == cp.bool_(False) assert to_list(ak.operations.all(array, axis=-1)) == [ [False, True, False, True], [], [False, True], ] - assert to_list(ak.operations.all(array, axis=-2)) == [ - [False, False, True], + del array + + +def test_nonreducers(): + x = ak.highlevel.Array([[1, 2, 3, 4, 5], [1, 2, 3, 4, 5]], check_valid=True) + y = ak.highlevel.Array( + [[1.1, 2.2, 2.9, 4.0, 5.1], [0.9, 2.1, 3.2, 4.1, 4.9]], check_valid=True + ) + x = ak.to_backend(x, "cuda") + y = ak.to_backend(y, "cuda") + + cpt.assert_allclose(ak.operations.mean(y), 
cp.mean(ak.operations.to_numpy(y))) + cpt.assert_allclose(ak.operations.var(y), cp.var(ak.operations.to_numpy(y))) + cpt.assert_allclose( + ak.operations.var(y, ddof=1), cp.var(ak.operations.to_numpy(y), ddof=1) + ) + cpt.assert_allclose(ak.operations.std(y), np.std(ak.operations.to_numpy(y))) + cpt.assert_allclose( + ak.operations.std(y, ddof=1), cp.std(ak.operations.to_numpy(y), ddof=1) + ) + + cpt.assert_allclose(ak.operations.moment(y, 1), cp.mean(ak.operations.to_numpy(y))) + cpt.assert_allclose( + ak.operations.moment(y - ak.operations.mean(y), 2), + cp.var(ak.operations.to_numpy(y)), + ) + cpt.assert_allclose(ak.operations.covar(y, y), cp.var(ak.operations.to_numpy(y))) + cpt.assert_allclose(ak.operations.corr(y, y), 1.0) + + cpt.assert_allclose(ak.operations.corr(x, y), 0.9968772535047296) + + cpt.assert_allclose( + to_list(ak.operations.mean(y, axis=-1)), + to_list(cp.mean(ak.operations.to_numpy(y), axis=-1)), + ) + cpt.assert_allclose( + to_list(ak.operations.var(y, axis=-1)), + to_list(cp.var(ak.operations.to_numpy(y), axis=-1)), + ) + cpt.assert_allclose( + to_list(ak.operations.var(y, axis=-1, ddof=1)), + to_list(cp.var(ak.operations.to_numpy(y), axis=-1, ddof=1)), + ) + cpt.assert_allclose( + to_list(ak.operations.std(y, axis=-1)), + to_list(cp.std(ak.operations.to_numpy(y), axis=-1)), + ) + cpt.assert_allclose( + to_list(ak.operations.std(y, axis=-1, ddof=1)), + to_list(cp.std(ak.operations.to_numpy(y), axis=-1, ddof=1)), + ) + + cpt.assert_allclose( + to_list(ak.operations.moment(y, 1, axis=-1)), + to_list(cp.mean(ak.operations.to_numpy(y), axis=-1)), + ) + cpt.assert_allclose( + to_list(ak.operations.moment(y - ak.operations.mean(y, axis=-1), 2, axis=-1)), + to_list(cp.var(ak.operations.to_numpy(y), axis=-1)), + ) + cpt.assert_allclose( + to_list(ak.operations.covar(y, y, axis=-1)), + to_list(cp.var(ak.operations.to_numpy(y), axis=-1)), + ) + cpt.assert_allclose(to_list(ak.operations.corr(y, y, axis=-1)), [1.0, 1.0]) + + cpt.assert_allclose( + 
to_list(ak.operations.corr(x, y, axis=-1)), + [0.9975103695813371, 0.9964193240901015], + ) + + +def test_softmax(): + array = ak.highlevel.Array( + [[np.log(2), np.log(2), np.log(4)], [], [np.log(5), np.log(5)]], + check_valid=True, + ) + array = ak.to_backend(array, "cuda") + + assert to_list(ak.operations.softmax(array, axis=-1)) == [ + pytest.approx([0.25, 0.25, 0.5]), [], - [False, True], + pytest.approx([0.5, 0.5]), ] del array + + +def test_prod_bool_1(): + # this had been silently broken + array = np.array([[True, False, False], [True, False, False]]) + content2 = ak.contents.NumpyArray(array.reshape(-1)) + offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + depth1 = ak.to_backend(depth1, "cuda") + + assert to_list(ak.prod(depth1, axis=-1, highlevel=False)) == [0, 1, 0, 0] + assert to_list(ak.all(depth1, axis=-1, highlevel=False)) == [ + False, + True, + False, + False, + ] + assert to_list(ak.min(depth1, axis=-1, highlevel=False)) == [ + False, + None, + False, + False, + ] + del depth1 + + +def test_prod_bool_2(): + array = np.array([[True, False, False], [True, False, False]]).view(np.uint8) + content2 = ak.contents.NumpyArray(array.reshape(-1)) + offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + depth1 = ak.to_backend(depth1, "cuda") + + assert to_list(ak.prod(depth1, axis=-1, highlevel=False)) == [0, 1, 0, 0] + assert to_list(ak.all(depth1, axis=-1, highlevel=False)) == [0, 1, 0, 0] + assert to_list(ak.min(depth1, axis=-1, highlevel=False)) == [0, None, 0, 0] + + array = np.array([[True, False, False], [True, False, False]]).astype(np.int32) + content2 = ak.contents.NumpyArray(array.reshape(-1)) + offsets3 = ak.index.Index64(np.array([0, 3, 3, 5, 6], dtype=np.int64)) + depth1 = ak.contents.ListOffsetArray(offsets3, content2) + depth1 = ak.to_backend(depth1, "cuda") + + assert 
to_list(ak.prod(depth1, axis=-1, highlevel=False)) == [0, 1, 0, 0] + assert to_list(ak.all(depth1, axis=-1, highlevel=False)) == [0, 1, 0, 0] + assert to_list(ak.min(depth1, axis=-1, highlevel=False)) == [0, None, 0, 0] + del depth1 From 8df6bd4332b7184db38eaa4d9c34129dbc3c9e19 Mon Sep 17 00:00:00 2001 From: Manasvi Goyal <55101825+ManasviGoyal@users.noreply.github.com> Date: Mon, 24 Jun 2024 14:27:47 +0200 Subject: [PATCH 33/33] Update dev/generate-tests.py Co-authored-by: Ianna Osborne --- dev/generate-tests.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/generate-tests.py b/dev/generate-tests.py index 45068c590c..7c97628101 100644 --- a/dev/generate-tests.py +++ b/dev/generate-tests.py @@ -961,6 +961,7 @@ def gencudakerneltests(specdict): f.write( "import cupy\n" "import cupy.testing as cpt\n" + "import numpy as np\n" "import pytest\n\n" "import awkward as ak\n" "import awkward._connect.cuda as ak_cu\n"