From fef83f2200e000761c309af923fbde3cee6c52c9 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 1 Jul 2025 08:50:25 +0200 Subject: [PATCH 1/7] Revert "Switch default algorithm for accumulate to ScanPrefixes" This reverts commit 2da869694f822f52a9ba0cf006a0da84db4436a2. --- src/accumulate/accumulate.jl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/accumulate/accumulate.jl b/src/accumulate/accumulate.jl index 3179b2a..96adf30 100644 --- a/src/accumulate/accumulate.jl +++ b/src/accumulate/accumulate.jl @@ -160,9 +160,7 @@ function _accumulate_impl!( dims::Union{Nothing, Int}=nothing, inclusive::Bool=true, - # FIXME: Switch back to `DecoupledLookback()` as the default algorithm - # once https://github.com/JuliaGPU/AcceleratedKernels.jl/pull/44 is merged. - alg::AccumulateAlgorithm=ScanPrefixes(), + alg::AccumulateAlgorithm=DecoupledLookback(), # CPU settings max_tasks::Int=Threads.nthreads(), From fea0c5c39b4a3469345b350397d9ade8a6d26b08 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sun, 15 Jun 2025 20:28:38 +0200 Subject: [PATCH 2/7] attempt to use UnsafeAtomics to fix race in accumulate --- Project.toml | 4 +++- src/AcceleratedKernels.jl | 1 + src/accumulate/accumulate_1d_gpu.jl | 24 +++++++++++++++--------- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/Project.toml b/Project.toml index 25894ff..15fb563 100644 --- a/Project.toml +++ b/Project.toml @@ -8,6 +8,7 @@ ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197" GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" +UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f" [weakdeps] Metal = "dde4c033-4e86-420c-a63e-0dd931031962" @@ -23,5 +24,6 @@ GPUArraysCore = "0.2.0" KernelAbstractions = "0.9.34" Markdown = "1" Metal = "1" -oneAPI = "1, 2" +UnsafeAtomics = "0.3.0" julia = "1.10" +oneAPI = "1, 2" diff --git a/src/AcceleratedKernels.jl b/src/AcceleratedKernels.jl index f97c9d8..a5c51a3 100644 --- a/src/AcceleratedKernels.jl +++ b/src/AcceleratedKernels.jl @@ -14,6 +14,7 @@ module AcceleratedKernels using ArgCheck: @argcheck using GPUArraysCore: AbstractGPUArray, @allowscalar using KernelAbstractions +import UnsafeAtomics # Exposed functions from upstream packages diff --git a/src/accumulate/accumulate_1d_gpu.jl b/src/accumulate/accumulate_1d_gpu.jl index be3ee59..1ff7ba2 100644 --- a/src/accumulate/accumulate_1d_gpu.jl +++ b/src/accumulate/accumulate_1d_gpu.jl @@ -149,7 +149,7 @@ end @kernel cpu=false inbounds=true unsafe_indices=true function _accumulate_previous!( - op, v, flags, @Const(prefixes), + op, v, flags::AbstractArray{UInt8}, @Const(prefixes), ) len = length(v) @@ -169,9 +169,9 @@ end running_prefix = prefixes[iblock - 0x1 + 0x1] inspected_block = signed(typeof(iblock))(iblock) - 0x2 while inspected_block >= 0x0 - # Opportunistic: a previous block finished everything - if flags[inspected_block + 0x1] == ACC_FLAG_A + if UnsafeAtomics.load(pointer(flags, inspected_block + 0x1), UnsafeAtomics.monotonic) == ACC_FLAG_A + UnsafeAtomics.fence(UnsafeAtomics.acquire) # (fence before reading from v) # Previous blocks (except last) always have filled values in v, so index is inbounds running_prefix = op(running_prefix, v[(inspected_block + 0x1) * block_size * 0x2]) break @@ -194,11 +194,17 @@ end end # Set flag for "aggregate of all prefixes up to this block finished" - @synchronize() # This is needed so that the flag is not set before copying into v, but - # there should be better memory fences to guarantee ordering without - # thread synchronization... + # There are two synchronization concerns here: + # 1. Withing a group we want to ensure that all writed to `v` have occured before setting the flag. + # 2. Between groups we need to use a fence and atomic load/store to ensure that memory operations are not re-ordered + @synchronize() # within-block + # Note: This fence is needed to ensure that the flag is not set before copying into v. + # See https://doc.rust-lang.org/std/sync/atomic/fn.fence.html + # for more details. + # We use the happens-before relation between stores to `v` and the store to `flags`. + UnsafeAtomics.fence(UnsafeAtomics.release) if ithread == 0x0 - flags[iblock + 0x1] = ACC_FLAG_A + UnsafeAtomics.store!(pointer(flags, iblock + 0x1), ACC_FLAG_A, UnsafeAtomics.monotonic) end end @@ -285,9 +291,9 @@ function accumulate_1d!( end if isnothing(temp_flags) - flags = similar(v, Int8, num_blocks) + flags = similar(v, UInt8, num_blocks) else - @argcheck eltype(temp_flags) <: Integer + @argcheck eltype(temp_flags) == UInt8 @argcheck length(temp_flags) >= num_blocks flags = temp_flags end From d4698abe2545138692603df8b3a5ce5aca679433 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 1 Jul 2025 08:52:57 +0200 Subject: [PATCH 3/7] Switch Metal back to the DecoupledLookback algorithm --- Project.toml | 3 --- ext/AcceleratedKernelsMetalExt.jl | 41 ------------------------------- 2 files changed, 44 deletions(-) delete mode 100644 ext/AcceleratedKernelsMetalExt.jl diff --git a/Project.toml b/Project.toml index 15fb563..8b948b6 100644 --- a/Project.toml +++ b/Project.toml @@ -11,11 +11,9 @@ Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f" [weakdeps] -Metal = "dde4c033-4e86-420c-a63e-0dd931031962" oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" [extensions] -AcceleratedKernelsMetalExt = "Metal" AcceleratedKernelsoneAPIExt = "oneAPI" [compat] @@ -23,7 +21,6 @@ ArgCheck = "2" GPUArraysCore = "0.2.0" KernelAbstractions = "0.9.34" Markdown = "1" -Metal = "1" UnsafeAtomics = "0.3.0" julia = "1.10" oneAPI = "1, 2" diff --git a/ext/AcceleratedKernelsMetalExt.jl b/ext/AcceleratedKernelsMetalExt.jl deleted file mode 100644 index 7d191dd..0000000 --- a/ext/AcceleratedKernelsMetalExt.jl +++ /dev/null @@ -1,41 +0,0 @@ -module AcceleratedKernelsMetalExt - - -using Metal -import AcceleratedKernels as AK - - -# On Metal use the ScanPrefixes accumulation algorithm by default as the DecoupledLookback algorithm -# cannot be supported due to Metal's weaker memory consistency guarantees. -function AK.accumulate!( - op, v::AbstractArray, backend::MetalBackend; - init, - # Algorithm choice is the only differing default - alg::AK.AccumulateAlgorithm=AK.ScanPrefixes(), - kwargs... -) - AK._accumulate_impl!( - op, v, backend; - init, alg, - kwargs... - ) -end - - -# Two-argument version for compatibility with Base.accumulate! -function AK.accumulate!( - op, dst::AbstractArray, src::AbstractArray, backend::MetalBackend; - init, - # Algorithm choice is the only differing default - alg::AK.AccumulateAlgorithm=AK.ScanPrefixes(), - kwargs... -) - copyto!(dst, src) - AK._accumulate_impl!( - op, dst, backend; - init, alg, - kwargs... - ) -end - -end # module AcceleratedKernelsMetalExt \ No newline at end of file From bc75725f5d586c38d7c7e1f53d6041ca45cf4e55 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 8 Jul 2025 11:49:45 +0200 Subject: [PATCH 4/7] allow for temp_flags of bigger size --- src/accumulate/accumulate_1d_gpu.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/accumulate/accumulate_1d_gpu.jl b/src/accumulate/accumulate_1d_gpu.jl index 1ff7ba2..11c4bb3 100644 --- a/src/accumulate/accumulate_1d_gpu.jl +++ b/src/accumulate/accumulate_1d_gpu.jl @@ -204,7 +204,7 @@ end # We use the happens-before relation between stores to `v` and the store to `flags`. UnsafeAtomics.fence(UnsafeAtomics.release) if ithread == 0x0 - UnsafeAtomics.store!(pointer(flags, iblock + 0x1), ACC_FLAG_A, UnsafeAtomics.monotonic) + UnsafeAtomics.store!(pointer(flags, iblock + 0x1), convert(eltype(flags), ACC_FLAG_A), UnsafeAtomics.monotonic) end end @@ -293,7 +293,7 @@ function accumulate_1d!( if isnothing(temp_flags) flags = similar(v, UInt8, num_blocks) else - @argcheck eltype(temp_flags) == UInt8 + @argcheck eltype(temp_flags) <: Integer @argcheck length(temp_flags) >= num_blocks flags = temp_flags end From 25d035b5930428ba63f66631a642858ac7cebaaf Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 8 Jul 2025 12:22:26 +0200 Subject: [PATCH 5/7] Update src/accumulate/accumulate_1d_gpu.jl --- src/accumulate/accumulate_1d_gpu.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accumulate/accumulate_1d_gpu.jl b/src/accumulate/accumulate_1d_gpu.jl index 11c4bb3..d777099 100644 --- a/src/accumulate/accumulate_1d_gpu.jl +++ b/src/accumulate/accumulate_1d_gpu.jl @@ -149,7 +149,7 @@ end @kernel cpu=false inbounds=true unsafe_indices=true function _accumulate_previous!( - op, v, flags::AbstractArray{UInt8}, @Const(prefixes), + op, v, flags, @Const(prefixes), ) len = length(v) From efd724087adcbcfc93e0209c29378e29da67e911 Mon Sep 17 00:00:00 2001 From: anicusan Date: Mon, 21 Jul 2025 07:59:52 +0100 Subject: [PATCH 6/7] Made ScanPrefixes the default accumulate algorithm --- src/accumulate/accumulate.jl | 30 ++++++++++++------------------ src/arithmetics.jl | 10 ++-------- 2 files changed, 14 insertions(+), 26 deletions(-) diff --git a/src/accumulate/accumulate.jl b/src/accumulate/accumulate.jl index 93d42c2..78ae287 100644 --- a/src/accumulate/accumulate.jl +++ b/src/accumulate/accumulate.jl @@ -40,7 +40,7 @@ include("accumulate_nd.jl") min_elems::Int=2, # Algorithm choice - alg::AccumulateAlgorithm=DecoupledLookback(), + alg::AccumulateAlgorithm=ScanPrefixes(), # GPU settings block_size::Int=256, @@ -60,7 +60,7 @@ include("accumulate_nd.jl") min_elems::Int=2, # Algorithm choice - alg::AccumulateAlgorithm=DecoupledLookback(), + alg::AccumulateAlgorithm=ScanPrefixes(), # GPU settings block_size::Int=256, @@ -89,13 +89,13 @@ becomes faster if it is a more compute-heavy operation to hide memory latency - ## GPU For the 1D case (`dims=nothing`), the `alg` can be one of the following: -- `DecoupledLookback()`: the default algorithm, using opportunistic lookback to reuse earlier - blocks' results; requires device-level memory consistency guarantees, which Apple Metal does not - provide. -- `ScanPrefixes()`: a simpler algorithm that scans the prefixes of each block, with no lookback; it - has similar performance as `DecoupledLookback()` for large block sizes, and small to medium arrays, +- `ScanPrefixes()`: the default algorithm that scans the prefixes of each block, with no lookback; it + has better performance than `DecoupledLookback()` for large block sizes, and small to medium arrays, but poorer scaling for many blocks; there is no performance degradation below `block_size^2` - elements. + elements, but it remains fast well into millions of elements. +- `DecoupledLookback()`: a more complex algorithm using opportunistic lookback to reuse earlier + blocks' results; requires device-level memory consistency guarantees (which Apple Metal does not + provide) and atomic orderings; theoretically more scalable for many blocks. A different, unique algorithm is used for the multi-dimensional case (`dims` is an integer). @@ -105,13 +105,7 @@ The temporaries are only used for the 1D case (`dims=nothing`): `temp` stores pe `temp_flags` is only used for the `DecoupledLookback()` algorithm for flagging if blocks are ready; they should both have at least `(length(v) + 2 * block_size - 1) รท (2 * block_size)` elements; also, `eltype(v) === eltype(temp)` is required; the elements in `temp_flags` can be any integers, but -`Int8` is used by default to reduce memory usage. - -# Platform-Specific Notes -On Metal, the `alg=ScanPrefixes()` algorithm is used by default, as Apple Metal GPUs do not have -strong enough memory consistency guarantees for the `DecoupledLookback()` algorithm - which -produces incorrect results about 0.38% of the time (the beauty of parallel algorithms, ey). Also, -`block_size=1024` is used here by default to reduce the number of coupled lookbacks. +`UInt8` is used by default to reduce memory usage. # Examples Example computing an inclusive prefix sum (the typical GPU "scan"): @@ -123,7 +117,7 @@ v = oneAPI.ones(Int32, 100_000) AK.accumulate!(+, v, init=0) # Use a different algorithm -AK.accumulate!(+, v, alg=AK.ScanPrefixes()) +AK.accumulate!(+, v, alg=AK.DecoupledLookback()) ``` """ function accumulate!( @@ -160,7 +154,7 @@ function _accumulate_impl!( dims::Union{Nothing, Int}=nothing, inclusive::Bool=true, - alg::AccumulateAlgorithm=DecoupledLookback(), + alg::AccumulateAlgorithm=ScanPrefixes(), # CPU settings max_tasks::Int=Threads.nthreads(), @@ -212,7 +206,7 @@ end min_elems::Int=2, # Algorithm choice - alg::AccumulateAlgorithm=DecoupledLookback(), + alg::AccumulateAlgorithm=ScanPrefixes(), # GPU settings block_size::Int=256, diff --git a/src/arithmetics.jl b/src/arithmetics.jl index 790db06..a14a9e2 100644 --- a/src/arithmetics.jl +++ b/src/arithmetics.jl @@ -307,7 +307,7 @@ end dims::Union{Nothing, Int}=nothing, # Algorithm choice - alg::AccumulateAlgorithm=DecoupledLookback(), + alg::AccumulateAlgorithm=ScanPrefixes(), # GPU settings block_size::Int=256, @@ -318,9 +318,6 @@ end Cumulative sum of elements of an array, with optional `init` and `dims`. Arguments are the same as for [`accumulate`](@ref). -## Platform-Specific Notes -On Apple Metal, the `alg=ScanPrefixes()` algorithm is used by default. - # Examples Simple cumulative sum of elements in a vector: ```julia @@ -360,7 +357,7 @@ end dims::Union{Nothing, Int}=nothing, # Algorithm choice - alg::AccumulateAlgorithm=DecoupledLookback(), + alg::AccumulateAlgorithm=ScanPrefixes(), # GPU settings block_size::Int=256, @@ -371,9 +368,6 @@ end Cumulative product of elements of an array, with optional `init` and `dims`. Arguments are the same as for [`accumulate`](@ref). -## Platform-Specific Notes -On Apple Metal, the `alg=ScanPrefixes()` algorithm is used by default. - # Examples Simple cumulative product of elements in a vector: ```julia From 97b63316840c54f6503381e9a46cfccefb46f10b Mon Sep 17 00:00:00 2001 From: anicusan Date: Tue, 22 Jul 2025 10:29:44 -0400 Subject: [PATCH 7/7] omit DecoupledLookback from testing in oneAPI until atomic orderings are fixed there --- test/runtests.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index dcb4896..716fd8e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -21,7 +21,9 @@ elseif "--oneAPI" in ARGS using oneAPI oneAPI.versioninfo() const BACKEND = oneAPIBackend() - TEST_DL[] = true + + # FIXME: need atomic orderings for `DecoupledLookback` in oneAPI + # TEST_DL[] = true elseif "--AMDGPU" in ARGS Pkg.add("AMDGPU") using AMDGPU