From 244c6183ba1951edbf2554849fbc4a84277fd3d8 Mon Sep 17 00:00:00 2001 From: Christian <28689358+christiangnrd@users.noreply.github.com> Date: Sun, 20 Jul 2025 15:06:10 -0300 Subject: [PATCH 1/3] Use GPUArrays accumulation implementation --- src/CUDA.jl | 1 - src/accumulate.jl | 234 --------------------------------------------- test/base/array.jl | 33 ------- 3 files changed, 268 deletions(-) delete mode 100644 src/accumulate.jl diff --git a/src/CUDA.jl b/src/CUDA.jl index 9502ad3986..634a86d18b 100644 --- a/src/CUDA.jl +++ b/src/CUDA.jl @@ -108,7 +108,6 @@ include("texture.jl") include("indexing.jl") include("broadcast.jl") include("mapreduce.jl") -include("accumulate.jl") include("reverse.jl") include("iterator.jl") include("sorting.jl") diff --git a/src/accumulate.jl b/src/accumulate.jl deleted file mode 100644 index 1ec21f20ea..0000000000 --- a/src/accumulate.jl +++ /dev/null @@ -1,234 +0,0 @@ -# scan and accumulate - -## COV_EXCL_START - -# partial scan of individual thread blocks within a grid -# work-efficient implementation after Blelloch (1990) -# -# number of threads needs to be a power-of-2 -# -# performance TODOs: -# - shuffle -# - warp-aggregate atomics -# - the ND case is quite a bit slower than the 1D case (not using Cartesian indices, -# before 35fcbde1f2987023229034370b0c9091e18c4137). optimize or special-case? 
-function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArray, - Rdim, Rpre, Rpost, Rother, neutral, init, - ::Val{inclusive}=Val(true)) where {T, inclusive} - threads = blockDim().x - thread = threadIdx().x - block = blockIdx().x - - temp = CuDynamicSharedArray(T, (2*threads,)) - - # iterate the main dimension using threads and the first block dimension - i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x - # iterate the other dimensions using the remaining block dimensions - j = (blockIdx().z-1i32) * gridDim().y + blockIdx().y - - if j > length(Rother) - return - end - - @inbounds begin - I = Rother[j] - Ipre = Rpre[I[1]] - Ipost = Rpost[I[2]] - end - - # load input into shared memory (apply `op` to have the correct type) - @inbounds temp[thread] = if i <= length(Rdim) - op(neutral, input[Ipre, i, Ipost]) - else - op(neutral, neutral) - end - - # build sum in place up the tree - offset = 1 - d = threads>>1 - while d > 0 - sync_threads() - @inbounds if thread <= d - ai = offset * (2*thread-1) - bi = offset * (2*thread) - temp[bi] = op(temp[ai], temp[bi]) - end - offset *= 2 - d >>= 1 - end - - # clear the last element - @inbounds if thread == 1 - temp[threads] = neutral - end - - # traverse down tree & build scan - d = 1 - while d < threads - offset >>= 1 - sync_threads() - @inbounds if thread <= d - ai = offset * (2*thread-1) - bi = offset * (2*thread) - - t = temp[ai] - temp[ai] = temp[bi] - temp[bi] = op(t, temp[bi]) - end - d *= 2 - end - - sync_threads() - - # write results to device memory - @inbounds if i <= length(Rdim) - val = if inclusive - op(temp[thread], input[Ipre, i, Ipost]) - else - temp[thread] - end - if init !== nothing - val = op(something(init), val) - end - output[Ipre, i, Ipost] = val - end - - return -end - -# aggregate the result of a partial scan by applying preceding block aggregates -function aggregate_partial_scan(op::Function, output::AbstractArray, - aggregates::AbstractArray, Rdim, Rpre, Rpost, Rother, - 
init) - threads = blockDim().x - thread = threadIdx().x - block = blockIdx().x - - # iterate the main dimension using threads and the first block dimension - i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x - # iterate the other dimensions using the remaining block dimensions - j = (blockIdx().z-1i32) * gridDim().y + blockIdx().y - - @inbounds if i <= length(Rdim) && j <= length(Rother) - I = Rother[j] - Ipre = Rpre[I[1]] - Ipost = Rpost[I[2]] - - val = if block > 1 - op(aggregates[Ipre, block-1, Ipost], output[Ipre, i, Ipost]) - else - output[Ipre, i, Ipost] - end - - if init !== nothing - val = op(something(init), val) - end - - output[Ipre, i, Ipost] = val - end - - return -end - -## COV_EXCL_STOP - -function scan!(f::Function, output::AnyCuArray{T}, input::AnyCuArray; - dims::Integer, init=nothing, neutral=GPUArrays.neutral_element(f, T)) where {T} - dims > 0 || throw(ArgumentError("dims must be a positive integer")) - inds_t = axes(input) - axes(output) == inds_t || throw(DimensionMismatch("shape of B must match A")) - dims > ndims(input) && return copyto!(output, input) - isempty(inds_t[dims]) && return output - - # iteration domain across the main dimension - Rdim = CartesianIndices((size(input, dims),)) - - # iteration domain for the other dimensions - Rpre = CartesianIndices(size(input)[1:dims-1]) - Rpost = CartesianIndices(size(input)[dims+1:end]) - Rother = CartesianIndices((length(Rpre), length(Rpost))) - - # determine how many threads we can launch for the scan kernel - kernel = @cuda launch=false partial_scan(f, output, input, Rdim, Rpre, Rpost, Rother, neutral, init, Val(true)) - kernel_config = launch_configuration(kernel.fun; shmem=(threads)->2*threads*sizeof(T)) - - # determine the grid layout to cover the other dimensions - if length(Rother) > 1 - dev = device() - max_other_blocks = attribute(dev, DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y) - blocks_other = (min(length(Rother), max_other_blocks), - cld(length(Rother), max_other_blocks)) - else - 
blocks_other = (1, 1) - end - - # does that suffice to scan the array in one go? - full = nextpow(2, length(Rdim)) - if full <= kernel_config.threads - @cuda(threads=full, blocks=(1, blocks_other...), shmem=2*full*sizeof(T), name="scan", - partial_scan(f, output, input, Rdim, Rpre, Rpost, Rother, neutral, init, Val(true))) - else - # perform partial scans across the scanning dimension - # NOTE: don't set init here to avoid applying the value multiple times - partial = prevpow(2, kernel_config.threads) - blocks_dim = cld(length(Rdim), partial) - @cuda(threads=partial, blocks=(blocks_dim, blocks_other...), shmem=2*partial*sizeof(T), - partial_scan(f, output, input, Rdim, Rpre, Rpost, Rother, neutral, nothing, Val(true))) - - # get the total of each thread block (except the first) of the partial scans - aggregates = fill(neutral, Base.setindex(size(input), blocks_dim, dims)) - partials = selectdim(output, dims, partial:partial:length(Rdim)) - indices = CartesianIndices(partials) - copyto!(aggregates, indices, partials, indices) - - # scan these totals to get totals for the entire partial scan - accumulate!(f, aggregates, aggregates; dims=dims) - - # add those totals to the partial scan result - # NOTE: we assume that this kernel requires fewer resources than the scan kernel. - # if that does not hold, launch with fewer threads and calculate - # the aggregate block index within the kernel itself. 
- @cuda(threads=partial, blocks=(blocks_dim, blocks_other...), - aggregate_partial_scan(f, output, aggregates, Rdim, Rpre, Rpost, Rother, init)) - - unsafe_free!(aggregates) - end - - return output -end - - -## Base interface - -Base._accumulate!(op, output::AnyCuArray, input::AnyCuVector, dims::Nothing, init::Nothing) = - scan!(op, output, input; dims=1) - -Base._accumulate!(op, output::AnyCuArray, input::AnyCuArray, dims::Integer, init::Nothing) = - scan!(op, output, input; dims=dims) - -Base._accumulate!(op, output::AnyCuArray, input::CuVector, dims::Nothing, init::Some) = - scan!(op, output, input; dims=1, init=init) - -Base._accumulate!(op, output::AnyCuArray, input::AnyCuArray, dims::Integer, init::Some) = - scan!(op, output, input; dims=dims, init=init) - -Base.accumulate_pairwise!(op, result::AnyCuVector, v::AnyCuVector) = accumulate!(op, result, v) - -# default behavior unless dims are specified by the user -function Base.accumulate(op, A::AnyCuArray; - dims::Union{Nothing,Integer}=nothing, kw...) - if dims === nothing && !(A isa AbstractVector) - # This branch takes care of the cases not handled by `_accumulate!`. - return reshape(accumulate(op, A[:]; kw...), size(A)) - end - nt = values(kw) - if isempty(kw) - out = similar(A, Base.promote_op(op, eltype(A), eltype(A))) - elseif keys(nt) === (:init,) - out = similar(A, Base.promote_op(op, typeof(nt.init), eltype(A))) - else - throw(ArgumentError("accumulate does not support the keyword arguments $(setdiff(keys(nt), (:init,)))")) - end - accumulate!(op, out, A; dims=dims, kw...) 
-end - diff --git a/test/base/array.jl b/test/base/array.jl index 51fb2fc219..5cf2c64164 100644 --- a/test/base/array.jl +++ b/test/base/array.jl @@ -413,39 +413,6 @@ end @test view(b, :, 1, :) isa StridedCuArray end -@testset "accumulate" begin - for n in (0, 1, 2, 3, 10, 10_000, 16384, 16384+1) # small, large, odd & even, pow2 and not - @test testf(x->accumulate(+, x), rand(n)) - @test testf(x->accumulate(+, x), rand(n,2)) - @test testf((x,y)->accumulate(+, x; init=y), rand(n), rand()) - end - - # multidimensional - for (sizes, dims) in ((2,) => 2, - (3,4,5) => 2, - (1, 70, 50, 20) => 3) - @test testf(x->accumulate(+, x; dims=dims), rand(Int, sizes)) - @test testf(x->accumulate(+, x), rand(Int, sizes)) - end - - # using initializer - for (sizes, dims) in ((2,) => 2, - (3,4,5) => 2, - (1, 70, 50, 20) => 3) - @test testf((x,y)->accumulate(+, x; dims=dims, init=y), rand(Int, sizes), rand(Int)) - @test testf((x,y)->accumulate(+, x; init=y), rand(Int, sizes), rand(Int)) - end - - # in place - @test testf(x->(accumulate!(+, x, copy(x)); x), rand(2)) - - # specialized - @test testf(cumsum, rand(2)) - @test testf(cumprod, rand(2)) - - @test_throws ArgumentError("accumulate does not support the keyword arguments [:bad_kwarg]") accumulate(+, CUDA.rand(1024); bad_kwarg="bad") -end - @testset "logical indexing" begin @test CuArray{Int}(undef, 2)[CUDA.ones(Bool, 2)] isa CuArray @test CuArray{Int}(undef, 2, 2)[CUDA.ones(Bool, 2, 2)] isa CuArray From 3c02fa90c235603faef84fbc9056079930b5a527 Mon Sep 17 00:00:00 2001 From: Christian <28689358+christiangnrd@users.noreply.github.com> Date: Sun, 20 Jul 2025 15:12:18 -0300 Subject: [PATCH 2/3] REMOVE BEFORE MERGE [only benchmarks] --- .buildkite/pipeline.yml | 1 + test/runtests.jl | 3 +++ 2 files changed, 4 insertions(+) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index ef4a60a93f..eea1668548 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -423,6 +423,7 @@ steps: println("--- :julia: 
 Instantiating project") Pkg.develop([PackageSpec(path=pwd())]) + Pkg.add(url="https://github.com/christiangnrd/GPUArrays.jl", rev="accumulatetests") println("+++ :julia: Benchmarking") include("perf/runbenchmarks.jl")' diff --git a/test/runtests.jl b/test/runtests.jl index 9172d18a5c..b6c479cce0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,6 +4,9 @@ import REPL using Printf: @sprintf using Base.Filesystem: path_separator +using Pkg +Pkg.add(url="https://github.com/christiangnrd/GPUArrays.jl", rev="accumulatetests") + # parse some command-line arguments function extract_flag!(args, flag, default=nothing; typ=typeof(default)) for f in args From 066fc01530f24a2276a193b9546ac7d9020a7461 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 5 Aug 2025 18:31:22 -0300 Subject: [PATCH 3/3] Benchmark reverse on bigger arrays --- perf/array.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/perf/array.jl b/perf/array.jl index 65baa304dd..3dbab9816c 100644 --- a/perf/array.jl +++ b/perf/array.jl @@ -52,9 +52,13 @@ end let group = addgroup!(group, "reverse") group["1d"] = @async_benchmarkable reverse($gpu_vec) + group["1dL"] = @async_benchmarkable reverse($gpu_vec_long) group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1) + group["2dL"] = @async_benchmarkable reverse($gpu_mat_long; dims=1) group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec) + group["1dL_inplace"] = @async_benchmarkable reverse!($gpu_vec_long) group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1) + group["2dL_inplace"] = @async_benchmarkable reverse!($gpu_mat_long; dims=1) end group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0