Skip to content
Merged
7 changes: 3 additions & 4 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,19 @@ ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197"
GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f"

[weakdeps]
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"

[extensions]
AcceleratedKernelsMetalExt = "Metal"
AcceleratedKernelsoneAPIExt = "oneAPI"

[compat]
ArgCheck = "2"
GPUArraysCore = "0.2.0"
KernelAbstractions = "0.9.34"
Markdown = "1"
Metal = "1"
oneAPI = "1, 2"
UnsafeAtomics = "0.3.0"
julia = "1.10"
oneAPI = "1, 2"
41 changes: 0 additions & 41 deletions ext/AcceleratedKernelsMetalExt.jl

This file was deleted.

1 change: 1 addition & 0 deletions src/AcceleratedKernels.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ module AcceleratedKernels
using ArgCheck: @argcheck
using GPUArraysCore: AnyGPUArray, @allowscalar
using KernelAbstractions
import UnsafeAtomics


# Exposed functions from upstream packages
Expand Down
30 changes: 11 additions & 19 deletions src/accumulate/accumulate.jl
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ include("accumulate_nd.jl")
min_elems::Int=2,

# Algorithm choice
alg::AccumulateAlgorithm=DecoupledLookback(),
alg::AccumulateAlgorithm=ScanPrefixes(),

# GPU settings
block_size::Int=256,
Expand All @@ -60,7 +60,7 @@ include("accumulate_nd.jl")
min_elems::Int=2,

# Algorithm choice
alg::AccumulateAlgorithm=DecoupledLookback(),
alg::AccumulateAlgorithm=ScanPrefixes(),

# GPU settings
block_size::Int=256,
Expand Down Expand Up @@ -89,13 +89,13 @@ becomes faster if it is a more compute-heavy operation to hide memory latency -

## GPU
For the 1D case (`dims=nothing`), the `alg` can be one of the following:
- `DecoupledLookback()`: the default algorithm, using opportunistic lookback to reuse earlier
blocks' results; requires device-level memory consistency guarantees, which Apple Metal does not
provide.
- `ScanPrefixes()`: a simpler algorithm that scans the prefixes of each block, with no lookback; it
has similar performance to `DecoupledLookback()` for large block sizes, and small to medium arrays,
- `ScanPrefixes()`: the default algorithm that scans the prefixes of each block, with no lookback; it
has better performance than `DecoupledLookback()` for large block sizes, and small to medium arrays,
but poorer scaling for many blocks; there is no performance degradation below `block_size^2`
elements.
elements, but it remains fast well into millions of elements.
- `DecoupledLookback()`: a more complex algorithm using opportunistic lookback to reuse earlier
blocks' results; requires device-level memory consistency guarantees (which Apple Metal does not
provide) and atomic orderings; theoretically more scalable for many blocks.

A different, unique algorithm is used for the multi-dimensional case (`dims` is an integer).

Expand All @@ -105,13 +105,7 @@ The temporaries are only used for the 1D case (`dims=nothing`): `temp` stores pe
`temp_flags` is only used for the `DecoupledLookback()` algorithm for flagging if blocks are ready;
they should both have at least `(length(v) + 2 * block_size - 1) ÷ (2 * block_size)` elements; also,
`eltype(v) === eltype(temp)` is required; the elements in `temp_flags` can be any integers, but
`Int8` is used by default to reduce memory usage.

# Platform-Specific Notes
On Metal, the `alg=ScanPrefixes()` algorithm is used by default, as Apple Metal GPUs do not have
strong enough memory consistency guarantees for the `DecoupledLookback()` algorithm - which
produces incorrect results about 0.38% of the time (the beauty of parallel algorithms, ey). Also,
`block_size=1024` is used here by default to reduce the number of coupled lookbacks.
`UInt8` is used by default to reduce memory usage.

# Examples
Example computing an inclusive prefix sum (the typical GPU "scan"):
Expand All @@ -123,7 +117,7 @@ v = oneAPI.ones(Int32, 100_000)
AK.accumulate!(+, v, init=0)

# Use a different algorithm
AK.accumulate!(+, v, alg=AK.ScanPrefixes())
AK.accumulate!(+, v, alg=AK.DecoupledLookback())
```
"""
function accumulate!(
Expand Down Expand Up @@ -160,8 +154,6 @@ function _accumulate_impl!(
dims::Union{Nothing, Int}=nothing,
inclusive::Bool=true,

# FIXME: Switch back to `DecoupledLookback()` as the default algorithm
# once https://github.com/JuliaGPU/AcceleratedKernels.jl/pull/44 is merged.
alg::AccumulateAlgorithm=ScanPrefixes(),

# CPU settings
Expand Down Expand Up @@ -214,7 +206,7 @@ end
min_elems::Int=2,

# Algorithm choice
alg::AccumulateAlgorithm=DecoupledLookback(),
alg::AccumulateAlgorithm=ScanPrefixes(),

# GPU settings
block_size::Int=256,
Expand Down
20 changes: 13 additions & 7 deletions src/accumulate/accumulate_1d_gpu.jl
Original file line number Diff line number Diff line change
Expand Up @@ -169,9 +169,9 @@ end
running_prefix = prefixes[iblock - 0x1 + 0x1]
inspected_block = signed(typeof(iblock))(iblock) - 0x2
while inspected_block >= 0x0

# Opportunistic: a previous block finished everything
if flags[inspected_block + 0x1] == ACC_FLAG_A
if UnsafeAtomics.load(pointer(flags, inspected_block + 0x1), UnsafeAtomics.monotonic) == ACC_FLAG_A
UnsafeAtomics.fence(UnsafeAtomics.acquire) # (fence before reading from v)
# Previous blocks (except last) always have filled values in v, so index is inbounds
running_prefix = op(running_prefix, v[(inspected_block + 0x1) * block_size * 0x2])
break
Expand All @@ -194,11 +194,17 @@ end
end

# Set flag for "aggregate of all prefixes up to this block finished"
@synchronize() # This is needed so that the flag is not set before copying into v, but
# there should be better memory fences to guarantee ordering without
# thread synchronization...
# There are two synchronization concerns here:
# 1. Within a group we want to ensure that all writes to `v` have occurred before setting the flag.
# 2. Between groups we need to use a fence and atomic load/store to ensure that memory operations are not re-ordered.
@synchronize() # within-block
# Note: This fence is needed to ensure that the flag is not set before copying into v.
# See https://doc.rust-lang.org/std/sync/atomic/fn.fence.html
# for more details.
# We use the happens-before relation between stores to `v` and the store to `flags`.
UnsafeAtomics.fence(UnsafeAtomics.release)
if ithread == 0x0
flags[iblock + 0x1] = ACC_FLAG_A
UnsafeAtomics.store!(pointer(flags, iblock + 0x1), convert(eltype(flags), ACC_FLAG_A), UnsafeAtomics.monotonic)
end
end

Expand Down Expand Up @@ -285,7 +291,7 @@ function accumulate_1d_gpu!(
end

if isnothing(temp_flags)
flags = similar(v, Int8, num_blocks)
flags = similar(v, UInt8, num_blocks)
else
@argcheck eltype(temp_flags) <: Integer
@argcheck length(temp_flags) >= num_blocks
Expand Down
10 changes: 2 additions & 8 deletions src/arithmetics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ end
dims::Union{Nothing, Int}=nothing,

# Algorithm choice
alg::AccumulateAlgorithm=DecoupledLookback(),
alg::AccumulateAlgorithm=ScanPrefixes(),

# GPU settings
block_size::Int=256,
Expand All @@ -318,9 +318,6 @@ end
Cumulative sum of elements of an array, with optional `init` and `dims`. Arguments are the same as
for [`accumulate`](@ref).

## Platform-Specific Notes
On Apple Metal, the `alg=ScanPrefixes()` algorithm is used by default.

# Examples
Simple cumulative sum of elements in a vector:
```julia
Expand Down Expand Up @@ -360,7 +357,7 @@ end
dims::Union{Nothing, Int}=nothing,

# Algorithm choice
alg::AccumulateAlgorithm=DecoupledLookback(),
alg::AccumulateAlgorithm=ScanPrefixes(),

# GPU settings
block_size::Int=256,
Expand All @@ -371,9 +368,6 @@ end
Cumulative product of elements of an array, with optional `init` and `dims`. Arguments are the same
as for [`accumulate`](@ref).

## Platform-Specific Notes
On Apple Metal, the `alg=ScanPrefixes()` algorithm is used by default.

# Examples
Simple cumulative product of elements in a vector:
```julia
Expand Down
4 changes: 3 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@ elseif "--oneAPI" in ARGS
using oneAPI
oneAPI.versioninfo()
const BACKEND = oneAPIBackend()
TEST_DL[] = true

# FIXME: need atomic orderings for `DecoupledLookback` in oneAPI
# TEST_DL[] = true
elseif "--AMDGPU" in ARGS
Pkg.add("AMDGPU")
using AMDGPU
Expand Down
Loading