From fef83f2200e000761c309af923fbde3cee6c52c9 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Tue, 1 Jul 2025 08:50:25 +0200
Subject: [PATCH 1/7] Revert "Switch default algorithm for accumulate to
 ScanPrefixes"

This reverts commit 2da869694f822f52a9ba0cf006a0da84db4436a2.
---
 src/accumulate/accumulate.jl | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/accumulate/accumulate.jl b/src/accumulate/accumulate.jl
index 3179b2a..96adf30 100644
--- a/src/accumulate/accumulate.jl
+++ b/src/accumulate/accumulate.jl
@@ -160,9 +160,7 @@ function _accumulate_impl!(
     dims::Union{Nothing, Int}=nothing,
     inclusive::Bool=true,
 
-    # FIXME: Switch back to `DecoupledLookback()` as the default algorithm
-    #         once https://github.com/JuliaGPU/AcceleratedKernels.jl/pull/44 is merged.
-    alg::AccumulateAlgorithm=ScanPrefixes(),
+    alg::AccumulateAlgorithm=DecoupledLookback(),
 
     # CPU settings
     max_tasks::Int=Threads.nthreads(),

From fea0c5c39b4a3469345b350397d9ade8a6d26b08 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Sun, 15 Jun 2025 20:28:38 +0200
Subject: [PATCH 2/7] attempt to use UnsafeAtomics to fix race in accumulate

---
 Project.toml                        |  4 +++-
 src/AcceleratedKernels.jl           |  1 +
 src/accumulate/accumulate_1d_gpu.jl | 24 +++++++++++++++---------
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/Project.toml b/Project.toml
index 25894ff..15fb563 100644
--- a/Project.toml
+++ b/Project.toml
@@ -8,6 +8,7 @@ ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
+UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f"
 
 [weakdeps]
 Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
@@ -23,5 +24,6 @@ GPUArraysCore = "0.2.0"
 KernelAbstractions = "0.9.34"
 Markdown = "1"
 Metal = "1"
-oneAPI = "1, 2"
+UnsafeAtomics = "0.3.0"
 julia = "1.10"
+oneAPI = "1, 2"
diff --git a/src/AcceleratedKernels.jl b/src/AcceleratedKernels.jl
index f97c9d8..a5c51a3 100644
--- a/src/AcceleratedKernels.jl
+++ b/src/AcceleratedKernels.jl
@@ -14,6 +14,7 @@ module AcceleratedKernels
 using ArgCheck: @argcheck
 using GPUArraysCore: AbstractGPUArray, @allowscalar
 using KernelAbstractions
+import UnsafeAtomics
 
 
 # Exposed functions from upstream packages
diff --git a/src/accumulate/accumulate_1d_gpu.jl b/src/accumulate/accumulate_1d_gpu.jl
index be3ee59..1ff7ba2 100644
--- a/src/accumulate/accumulate_1d_gpu.jl
+++ b/src/accumulate/accumulate_1d_gpu.jl
@@ -149,7 +149,7 @@ end
 
 
 @kernel cpu=false inbounds=true unsafe_indices=true function _accumulate_previous!(
-    op, v, flags, @Const(prefixes),
+    op, v, flags::AbstractArray{UInt8}, @Const(prefixes),
 )
 
     len = length(v)
@@ -169,9 +169,9 @@ end
     running_prefix = prefixes[iblock - 0x1 + 0x1]
     inspected_block = signed(typeof(iblock))(iblock) - 0x2
     while inspected_block >= 0x0
-
         # Opportunistic: a previous block finished everything
-        if flags[inspected_block + 0x1] == ACC_FLAG_A
+        if UnsafeAtomics.load(pointer(flags, inspected_block + 0x1), UnsafeAtomics.monotonic) == ACC_FLAG_A
+            UnsafeAtomics.fence(UnsafeAtomics.acquire) # (fence before reading from v)
             # Previous blocks (except last) always have filled values in v, so index is inbounds
             running_prefix = op(running_prefix, v[(inspected_block + 0x1) * block_size * 0x2])
             break
@@ -194,11 +194,17 @@ end
     end
 
     # Set flag for "aggregate of all prefixes up to this block finished"
-    @synchronize()      # This is needed so that the flag is not set before copying into v, but
-                        # there should be better memory fences to guarantee ordering without
-                        # thread synchronization...
+    # There are two synchronization concerns here:
+    # 1. Withing a group we want to ensure that all writed to `v` have occured before setting the flag.
+    # 2. Between groups we need to use a fence and atomic load/store to ensure that memory operations are not re-ordered
+    @synchronize() # within-block
+    # Note: This fence is needed to ensure that the flag is not set before copying into v.
+    #       See https://doc.rust-lang.org/std/sync/atomic/fn.fence.html
+    #       for more details.
+    #       We use the happens-before relation between stores to `v` and the store to `flags`.
+    UnsafeAtomics.fence(UnsafeAtomics.release)
     if ithread == 0x0
-        flags[iblock + 0x1] = ACC_FLAG_A
+        UnsafeAtomics.store!(pointer(flags, iblock + 0x1), ACC_FLAG_A, UnsafeAtomics.monotonic)
     end
 end
 
@@ -285,9 +291,9 @@ function accumulate_1d!(
     end
 
     if isnothing(temp_flags)
-        flags = similar(v, Int8, num_blocks)
+        flags = similar(v, UInt8, num_blocks)
     else
-        @argcheck eltype(temp_flags) <: Integer
+        @argcheck eltype(temp_flags) == UInt8
         @argcheck length(temp_flags) >= num_blocks
         flags = temp_flags
     end

From d4698abe2545138692603df8b3a5ce5aca679433 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Tue, 1 Jul 2025 08:52:57 +0200
Subject: [PATCH 3/7] Switch Metal back to the DecoupledLookback algorithm

---
 Project.toml                      |  3 ---
 ext/AcceleratedKernelsMetalExt.jl | 41 -------------------------------
 2 files changed, 44 deletions(-)
 delete mode 100644 ext/AcceleratedKernelsMetalExt.jl

diff --git a/Project.toml b/Project.toml
index 15fb563..8b948b6 100644
--- a/Project.toml
+++ b/Project.toml
@@ -11,11 +11,9 @@ Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
 UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f"
 
 [weakdeps]
-Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
 oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
 
 [extensions]
-AcceleratedKernelsMetalExt = "Metal"
 AcceleratedKernelsoneAPIExt = "oneAPI"
 
 [compat]
@@ -23,7 +21,6 @@ ArgCheck = "2"
 GPUArraysCore = "0.2.0"
 KernelAbstractions = "0.9.34"
 Markdown = "1"
-Metal = "1"
 UnsafeAtomics = "0.3.0"
 julia = "1.10"
 oneAPI = "1, 2"
diff --git a/ext/AcceleratedKernelsMetalExt.jl b/ext/AcceleratedKernelsMetalExt.jl
deleted file mode 100644
index 7d191dd..0000000
--- a/ext/AcceleratedKernelsMetalExt.jl
+++ /dev/null
@@ -1,41 +0,0 @@
-module AcceleratedKernelsMetalExt
-
-
-using Metal
-import AcceleratedKernels as AK
-
-
-# On Metal use the ScanPrefixes accumulation algorithm by default as the DecoupledLookback algorithm
-# cannot be supported due to Metal's weaker memory consistency guarantees.
-function AK.accumulate!(
-    op, v::AbstractArray, backend::MetalBackend;
-    init,
-    # Algorithm choice is the only differing default
-    alg::AK.AccumulateAlgorithm=AK.ScanPrefixes(),
-    kwargs...
-)
-    AK._accumulate_impl!(
-        op, v, backend;
-        init, alg,
-        kwargs...
-    )
-end
-
-
-# Two-argument version for compatibility with Base.accumulate!
-function AK.accumulate!(
-    op, dst::AbstractArray, src::AbstractArray, backend::MetalBackend;
-    init,
-    # Algorithm choice is the only differing default
-    alg::AK.AccumulateAlgorithm=AK.ScanPrefixes(),
-    kwargs...
-)
-    copyto!(dst, src)
-    AK._accumulate_impl!(
-        op, dst, backend;
-        init, alg,
-        kwargs...
-    )
-end
-
-end   # module AcceleratedKernelsMetalExt
\ No newline at end of file

From bc75725f5d586c38d7c7e1f53d6041ca45cf4e55 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Tue, 8 Jul 2025 11:49:45 +0200
Subject: [PATCH 4/7] allow for temp_flags of bigger size

---
 src/accumulate/accumulate_1d_gpu.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/accumulate/accumulate_1d_gpu.jl b/src/accumulate/accumulate_1d_gpu.jl
index 1ff7ba2..11c4bb3 100644
--- a/src/accumulate/accumulate_1d_gpu.jl
+++ b/src/accumulate/accumulate_1d_gpu.jl
@@ -204,7 +204,7 @@ end
     #       We use the happens-before relation between stores to `v` and the store to `flags`.
     UnsafeAtomics.fence(UnsafeAtomics.release)
     if ithread == 0x0
-        UnsafeAtomics.store!(pointer(flags, iblock + 0x1), ACC_FLAG_A, UnsafeAtomics.monotonic)
+        UnsafeAtomics.store!(pointer(flags, iblock + 0x1), convert(eltype(flags), ACC_FLAG_A), UnsafeAtomics.monotonic)
     end
 end
 
@@ -293,7 +293,7 @@ function accumulate_1d!(
     if isnothing(temp_flags)
         flags = similar(v, UInt8, num_blocks)
     else
-        @argcheck eltype(temp_flags) == UInt8
+        @argcheck eltype(temp_flags) <: Integer
         @argcheck length(temp_flags) >= num_blocks
         flags = temp_flags
     end

From 25d035b5930428ba63f66631a642858ac7cebaaf Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Tue, 8 Jul 2025 12:22:26 +0200
Subject: [PATCH 5/7] Update src/accumulate/accumulate_1d_gpu.jl

---
 src/accumulate/accumulate_1d_gpu.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/accumulate/accumulate_1d_gpu.jl b/src/accumulate/accumulate_1d_gpu.jl
index 11c4bb3..d777099 100644
--- a/src/accumulate/accumulate_1d_gpu.jl
+++ b/src/accumulate/accumulate_1d_gpu.jl
@@ -149,7 +149,7 @@ end
 
 
 @kernel cpu=false inbounds=true unsafe_indices=true function _accumulate_previous!(
-    op, v, flags::AbstractArray{UInt8}, @Const(prefixes),
+    op, v, flags, @Const(prefixes),
 )
 
     len = length(v)

From efd724087adcbcfc93e0209c29378e29da67e911 Mon Sep 17 00:00:00 2001
From: anicusan <aln705@student.bham.ac.uk>
Date: Mon, 21 Jul 2025 07:59:52 +0100
Subject: [PATCH 6/7] Made ScanPrefixes the default accumulate algorithm

---
 src/accumulate/accumulate.jl | 30 ++++++++++++------------------
 src/arithmetics.jl           | 10 ++--------
 2 files changed, 14 insertions(+), 26 deletions(-)

diff --git a/src/accumulate/accumulate.jl b/src/accumulate/accumulate.jl
index 93d42c2..78ae287 100644
--- a/src/accumulate/accumulate.jl
+++ b/src/accumulate/accumulate.jl
@@ -40,7 +40,7 @@ include("accumulate_nd.jl")
         min_elems::Int=2,
 
         # Algorithm choice
-        alg::AccumulateAlgorithm=DecoupledLookback(),
+        alg::AccumulateAlgorithm=ScanPrefixes(),
 
         # GPU settings
         block_size::Int=256,
@@ -60,7 +60,7 @@ include("accumulate_nd.jl")
         min_elems::Int=2,
 
         # Algorithm choice
-        alg::AccumulateAlgorithm=DecoupledLookback(),
+        alg::AccumulateAlgorithm=ScanPrefixes(),
 
         # GPU settings
         block_size::Int=256,
@@ -89,13 +89,13 @@ becomes faster if it is a more compute-heavy operation to hide memory latency -
 
 ## GPU
 For the 1D case (`dims=nothing`), the `alg` can be one of the following:
-- `DecoupledLookback()`: the default algorithm, using opportunistic lookback to reuse earlier
-  blocks' results; requires device-level memory consistency guarantees, which Apple Metal does not
-  provide.
-- `ScanPrefixes()`: a simpler algorithm that scans the prefixes of each block, with no lookback; it
-  has similar performance as `DecoupledLookback()` for large block sizes, and small to medium arrays,
+- `ScanPrefixes()`: the default algorithm that scans the prefixes of each block, with no lookback; it
+  has better performance than `DecoupledLookback()` for large block sizes, and small to medium arrays,
   but poorer scaling for many blocks; there is no performance degradation below `block_size^2`
-  elements.
+  elements, but it remains fast well into millions of elements.
+- `DecoupledLookback()`: a more complex algorithm using opportunistic lookback to reuse earlier
+  blocks' results; requires device-level memory consistency guarantees (which Apple Metal does not
+  provide) and atomic orderings; theoretically more scalable for many blocks.
 
 A different, unique algorithm is used for the multi-dimensional case (`dims` is an integer).
 
@@ -105,13 +105,7 @@ The temporaries are only used for the 1D case (`dims=nothing`): `temp` stores pe
 `temp_flags` is only used for the `DecoupledLookback()` algorithm for flagging if blocks are ready;
 they should both have at least `(length(v) + 2 * block_size - 1) ÷ (2 * block_size)` elements; also,
 `eltype(v) === eltype(temp)` is required; the elements in `temp_flags` can be any integers, but
-`Int8` is used by default to reduce memory usage.
-
-# Platform-Specific Notes
-On Metal, the `alg=ScanPrefixes()` algorithm is used by default, as Apple Metal GPUs do not have
-strong enough memory consistency guarantees for the `DecoupledLookback()` algorithm - which
-produces incorrect results about 0.38% of the time (the beauty of parallel algorithms, ey). Also,
-`block_size=1024` is used here by default to reduce the number of coupled lookbacks.
+`UInt8` is used by default to reduce memory usage.
 
 # Examples
 Example computing an inclusive prefix sum (the typical GPU "scan"):
@@ -123,7 +117,7 @@ v = oneAPI.ones(Int32, 100_000)
 AK.accumulate!(+, v, init=0)
 
 # Use a different algorithm
-AK.accumulate!(+, v, alg=AK.ScanPrefixes())
+AK.accumulate!(+, v, alg=AK.DecoupledLookback())
 ```
 """
 function accumulate!(
@@ -160,7 +154,7 @@ function _accumulate_impl!(
     dims::Union{Nothing, Int}=nothing,
     inclusive::Bool=true,
 
-    alg::AccumulateAlgorithm=DecoupledLookback(),
+    alg::AccumulateAlgorithm=ScanPrefixes(),
 
     # CPU settings
     max_tasks::Int=Threads.nthreads(),
@@ -212,7 +206,7 @@ end
         min_elems::Int=2,
 
         # Algorithm choice
-        alg::AccumulateAlgorithm=DecoupledLookback(),
+        alg::AccumulateAlgorithm=ScanPrefixes(),
 
         # GPU settings
         block_size::Int=256,
diff --git a/src/arithmetics.jl b/src/arithmetics.jl
index 790db06..a14a9e2 100644
--- a/src/arithmetics.jl
+++ b/src/arithmetics.jl
@@ -307,7 +307,7 @@ end
         dims::Union{Nothing, Int}=nothing,
 
         # Algorithm choice
-        alg::AccumulateAlgorithm=DecoupledLookback(),
+        alg::AccumulateAlgorithm=ScanPrefixes(),
 
         # GPU settings
         block_size::Int=256,
@@ -318,9 +318,6 @@ end
 Cumulative sum of elements of an array, with optional `init` and `dims`. Arguments are the same as
 for [`accumulate`](@ref).
 
-## Platform-Specific Notes
-On Apple Metal, the `alg=ScanPrefixes()` algorithm is used by default.
-
 # Examples
 Simple cumulative sum of elements in a vector:
 ```julia
@@ -360,7 +357,7 @@ end
         dims::Union{Nothing, Int}=nothing,
 
         # Algorithm choice
-        alg::AccumulateAlgorithm=DecoupledLookback(),
+        alg::AccumulateAlgorithm=ScanPrefixes(),
 
         # GPU settings
         block_size::Int=256,
@@ -371,9 +368,6 @@ end
 Cumulative product of elements of an array, with optional `init` and `dims`. Arguments are the same
 as for [`accumulate`](@ref).
 
-## Platform-Specific Notes
-On Apple Metal, the `alg=ScanPrefixes()` algorithm is used by default.
-
 # Examples
 Simple cumulative product of elements in a vector:
 ```julia

From 97b63316840c54f6503381e9a46cfccefb46f10b Mon Sep 17 00:00:00 2001
From: anicusan <aln705@student.bham.ac.uk>
Date: Tue, 22 Jul 2025 10:29:44 -0400
Subject: [PATCH 7/7] omit DecoupledLookback from testing in oneAPI until
 atomic orderings are fixed there

---
 test/runtests.jl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index dcb4896..716fd8e 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -21,7 +21,9 @@ elseif "--oneAPI" in ARGS
     using oneAPI
     oneAPI.versioninfo()
     const BACKEND = oneAPIBackend()
-    TEST_DL[] = true
+
+    # FIXME: need atomic orderings for `DecoupledLookback` in oneAPI
+    # TEST_DL[] = true
 elseif "--AMDGPU" in ARGS
     Pkg.add("AMDGPU")
     using AMDGPU