From 244c6183ba1951edbf2554849fbc4a84277fd3d8 Mon Sep 17 00:00:00 2001 From: Christian <28689358+christiangnrd@users.noreply.github.com> Date: Sun, 20 Jul 2025 15:06:10 -0300 Subject: [PATCH 1/3] Use GPUArrays accumulation implementation --- src/CUDA.jl | 1 - src/accumulate.jl | 234 --------------------------------------------- test/base/array.jl | 33 ------- 3 files changed, 268 deletions(-) delete mode 100644 src/accumulate.jl diff --git a/src/CUDA.jl b/src/CUDA.jl index 9502ad3986..634a86d18b 100644 --- a/src/CUDA.jl +++ b/src/CUDA.jl @@ -108,7 +108,6 @@ include("texture.jl") include("indexing.jl") include("broadcast.jl") include("mapreduce.jl") -include("accumulate.jl") include("reverse.jl") include("iterator.jl") include("sorting.jl") diff --git a/src/accumulate.jl b/src/accumulate.jl deleted file mode 100644 index 1ec21f20ea..0000000000 --- a/src/accumulate.jl +++ /dev/null @@ -1,234 +0,0 @@ -# scan and accumulate - -## COV_EXCL_START - -# partial scan of individual thread blocks within a grid -# work-efficient implementation after Blelloch (1990) -# -# number of threads needs to be a power-of-2 -# -# performance TODOs: -# - shuffle -# - warp-aggregate atomics -# - the ND case is quite a bit slower than the 1D case (not using Cartesian indices, -# before 35fcbde1f2987023229034370b0c9091e18c4137). optimize or special-case? 
-function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArray, - Rdim, Rpre, Rpost, Rother, neutral, init, - ::Val{inclusive}=Val(true)) where {T, inclusive} - threads = blockDim().x - thread = threadIdx().x - block = blockIdx().x - - temp = CuDynamicSharedArray(T, (2*threads,)) - - # iterate the main dimension using threads and the first block dimension - i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x - # iterate the other dimensions using the remaining block dimensions - j = (blockIdx().z-1i32) * gridDim().y + blockIdx().y - - if j > length(Rother) - return - end - - @inbounds begin - I = Rother[j] - Ipre = Rpre[I[1]] - Ipost = Rpost[I[2]] - end - - # load input into shared memory (apply `op` to have the correct type) - @inbounds temp[thread] = if i <= length(Rdim) - op(neutral, input[Ipre, i, Ipost]) - else - op(neutral, neutral) - end - - # build sum in place up the tree - offset = 1 - d = threads>>1 - while d > 0 - sync_threads() - @inbounds if thread <= d - ai = offset * (2*thread-1) - bi = offset * (2*thread) - temp[bi] = op(temp[ai], temp[bi]) - end - offset *= 2 - d >>= 1 - end - - # clear the last element - @inbounds if thread == 1 - temp[threads] = neutral - end - - # traverse down tree & build scan - d = 1 - while d < threads - offset >>= 1 - sync_threads() - @inbounds if thread <= d - ai = offset * (2*thread-1) - bi = offset * (2*thread) - - t = temp[ai] - temp[ai] = temp[bi] - temp[bi] = op(t, temp[bi]) - end - d *= 2 - end - - sync_threads() - - # write results to device memory - @inbounds if i <= length(Rdim) - val = if inclusive - op(temp[thread], input[Ipre, i, Ipost]) - else - temp[thread] - end - if init !== nothing - val = op(something(init), val) - end - output[Ipre, i, Ipost] = val - end - - return -end - -# aggregate the result of a partial scan by applying preceding block aggregates -function aggregate_partial_scan(op::Function, output::AbstractArray, - aggregates::AbstractArray, Rdim, Rpre, Rpost, Rother, - 
init) - threads = blockDim().x - thread = threadIdx().x - block = blockIdx().x - - # iterate the main dimension using threads and the first block dimension - i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x - # iterate the other dimensions using the remaining block dimensions - j = (blockIdx().z-1i32) * gridDim().y + blockIdx().y - - @inbounds if i <= length(Rdim) && j <= length(Rother) - I = Rother[j] - Ipre = Rpre[I[1]] - Ipost = Rpost[I[2]] - - val = if block > 1 - op(aggregates[Ipre, block-1, Ipost], output[Ipre, i, Ipost]) - else - output[Ipre, i, Ipost] - end - - if init !== nothing - val = op(something(init), val) - end - - output[Ipre, i, Ipost] = val - end - - return -end - -## COV_EXCL_STOP - -function scan!(f::Function, output::AnyCuArray{T}, input::AnyCuArray; - dims::Integer, init=nothing, neutral=GPUArrays.neutral_element(f, T)) where {T} - dims > 0 || throw(ArgumentError("dims must be a positive integer")) - inds_t = axes(input) - axes(output) == inds_t || throw(DimensionMismatch("shape of B must match A")) - dims > ndims(input) && return copyto!(output, input) - isempty(inds_t[dims]) && return output - - # iteration domain across the main dimension - Rdim = CartesianIndices((size(input, dims),)) - - # iteration domain for the other dimensions - Rpre = CartesianIndices(size(input)[1:dims-1]) - Rpost = CartesianIndices(size(input)[dims+1:end]) - Rother = CartesianIndices((length(Rpre), length(Rpost))) - - # determine how many threads we can launch for the scan kernel - kernel = @cuda launch=false partial_scan(f, output, input, Rdim, Rpre, Rpost, Rother, neutral, init, Val(true)) - kernel_config = launch_configuration(kernel.fun; shmem=(threads)->2*threads*sizeof(T)) - - # determine the grid layout to cover the other dimensions - if length(Rother) > 1 - dev = device() - max_other_blocks = attribute(dev, DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y) - blocks_other = (min(length(Rother), max_other_blocks), - cld(length(Rother), max_other_blocks)) - else - 
blocks_other = (1, 1) - end - - # does that suffice to scan the array in one go? - full = nextpow(2, length(Rdim)) - if full <= kernel_config.threads - @cuda(threads=full, blocks=(1, blocks_other...), shmem=2*full*sizeof(T), name="scan", - partial_scan(f, output, input, Rdim, Rpre, Rpost, Rother, neutral, init, Val(true))) - else - # perform partial scans across the scanning dimension - # NOTE: don't set init here to avoid applying the value multiple times - partial = prevpow(2, kernel_config.threads) - blocks_dim = cld(length(Rdim), partial) - @cuda(threads=partial, blocks=(blocks_dim, blocks_other...), shmem=2*partial*sizeof(T), - partial_scan(f, output, input, Rdim, Rpre, Rpost, Rother, neutral, nothing, Val(true))) - - # get the total of each thread block (except the first) of the partial scans - aggregates = fill(neutral, Base.setindex(size(input), blocks_dim, dims)) - partials = selectdim(output, dims, partial:partial:length(Rdim)) - indices = CartesianIndices(partials) - copyto!(aggregates, indices, partials, indices) - - # scan these totals to get totals for the entire partial scan - accumulate!(f, aggregates, aggregates; dims=dims) - - # add those totals to the partial scan result - # NOTE: we assume that this kernel requires fewer resources than the scan kernel. - # if that does not hold, launch with fewer threads and calculate - # the aggregate block index within the kernel itself. 
- @cuda(threads=partial, blocks=(blocks_dim, blocks_other...), - aggregate_partial_scan(f, output, aggregates, Rdim, Rpre, Rpost, Rother, init)) - - unsafe_free!(aggregates) - end - - return output -end - - -## Base interface - -Base._accumulate!(op, output::AnyCuArray, input::AnyCuVector, dims::Nothing, init::Nothing) = - scan!(op, output, input; dims=1) - -Base._accumulate!(op, output::AnyCuArray, input::AnyCuArray, dims::Integer, init::Nothing) = - scan!(op, output, input; dims=dims) - -Base._accumulate!(op, output::AnyCuArray, input::CuVector, dims::Nothing, init::Some) = - scan!(op, output, input; dims=1, init=init) - -Base._accumulate!(op, output::AnyCuArray, input::AnyCuArray, dims::Integer, init::Some) = - scan!(op, output, input; dims=dims, init=init) - -Base.accumulate_pairwise!(op, result::AnyCuVector, v::AnyCuVector) = accumulate!(op, result, v) - -# default behavior unless dims are specified by the user -function Base.accumulate(op, A::AnyCuArray; - dims::Union{Nothing,Integer}=nothing, kw...) - if dims === nothing && !(A isa AbstractVector) - # This branch takes care of the cases not handled by `_accumulate!`. - return reshape(accumulate(op, A[:]; kw...), size(A)) - end - nt = values(kw) - if isempty(kw) - out = similar(A, Base.promote_op(op, eltype(A), eltype(A))) - elseif keys(nt) === (:init,) - out = similar(A, Base.promote_op(op, typeof(nt.init), eltype(A))) - else - throw(ArgumentError("accumulate does not support the keyword arguments $(setdiff(keys(nt), (:init,)))")) - end - accumulate!(op, out, A; dims=dims, kw...) 
-end - diff --git a/test/base/array.jl b/test/base/array.jl index 51fb2fc219..5cf2c64164 100644 --- a/test/base/array.jl +++ b/test/base/array.jl @@ -413,39 +413,6 @@ end @test view(b, :, 1, :) isa StridedCuArray end -@testset "accumulate" begin - for n in (0, 1, 2, 3, 10, 10_000, 16384, 16384+1) # small, large, odd & even, pow2 and not - @test testf(x->accumulate(+, x), rand(n)) - @test testf(x->accumulate(+, x), rand(n,2)) - @test testf((x,y)->accumulate(+, x; init=y), rand(n), rand()) - end - - # multidimensional - for (sizes, dims) in ((2,) => 2, - (3,4,5) => 2, - (1, 70, 50, 20) => 3) - @test testf(x->accumulate(+, x; dims=dims), rand(Int, sizes)) - @test testf(x->accumulate(+, x), rand(Int, sizes)) - end - - # using initializer - for (sizes, dims) in ((2,) => 2, - (3,4,5) => 2, - (1, 70, 50, 20) => 3) - @test testf((x,y)->accumulate(+, x; dims=dims, init=y), rand(Int, sizes), rand(Int)) - @test testf((x,y)->accumulate(+, x; init=y), rand(Int, sizes), rand(Int)) - end - - # in place - @test testf(x->(accumulate!(+, x, copy(x)); x), rand(2)) - - # specialized - @test testf(cumsum, rand(2)) - @test testf(cumprod, rand(2)) - - @test_throws ArgumentError("accumulate does not support the keyword arguments [:bad_kwarg]") accumulate(+, CUDA.rand(1024); bad_kwarg="bad") -end - @testset "logical indexing" begin @test CuArray{Int}(undef, 2)[CUDA.ones(Bool, 2)] isa CuArray @test CuArray{Int}(undef, 2, 2)[CUDA.ones(Bool, 2, 2)] isa CuArray From 3c02fa90c235603faef84fbc9056079930b5a527 Mon Sep 17 00:00:00 2001 From: Christian <28689358+christiangnrd@users.noreply.github.com> Date: Sun, 20 Jul 2025 15:12:18 -0300 Subject: [PATCH 2/3] REMOVE BEFORE MERGE [only benchmarks] --- .buildkite/pipeline.yml | 1 + test/runtests.jl | 3 +++ 2 files changed, 4 insertions(+) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index ef4a60a93f..eea1668548 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -423,6 +423,7 @@ steps: println("--- :julia: 
 Instantiating project") Pkg.develop([PackageSpec(path=pwd())]) + Pkg.add(url="https://github.com/christiangnrd/GPUArrays.jl", rev="accumulatetests") println("+++ :julia: Benchmarking") include("perf/runbenchmarks.jl")' diff --git a/test/runtests.jl b/test/runtests.jl index 9172d18a5c..b6c479cce0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,6 +4,9 @@ import REPL using Printf: @sprintf using Base.Filesystem: path_separator +using Pkg +Pkg.add(url="https://github.com/christiangnrd/GPUArrays.jl", rev="accumulatetests") + # parse some command-line arguments function extract_flag!(args, flag, default=nothing; typ=typeof(default)) for f in args From 066fc01530f24a2276a193b9546ac7d9020a7461 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 5 Aug 2025 18:31:22 -0300 Subject: [PATCH 3/3] Benchmark reverse on bigger arrays --- perf/array.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/perf/array.jl b/perf/array.jl index 65baa304dd..3dbab9816c 100644 --- a/perf/array.jl +++ b/perf/array.jl @@ -52,9 +52,13 @@ end let group = addgroup!(group, "reverse") group["1d"] = @async_benchmarkable reverse($gpu_vec) + group["1dL"] = @async_benchmarkable reverse($gpu_vec_long) group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1) + group["2dL"] = @async_benchmarkable reverse($gpu_mat_long; dims=1) group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec) + group["1dL_inplace"] = @async_benchmarkable reverse!($gpu_vec_long) group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1) + group["2dL_inplace"] = @async_benchmarkable reverse!($gpu_mat_long; dims=1) end group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0