Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,19 @@ ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197"
GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f"

[weakdeps]
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"

[extensions]
AcceleratedKernelsMetalExt = "Metal"
AcceleratedKernelsoneAPIExt = "oneAPI"

[compat]
ArgCheck = "2"
GPUArraysCore = "0.2.0"
KernelAbstractions = "0.9.34"
Markdown = "1"
Metal = "1"
oneAPI = "1, 2"
UnsafeAtomics = "0.3.0"
julia = "1.10"
oneAPI = "1, 2"
41 changes: 0 additions & 41 deletions ext/AcceleratedKernelsMetalExt.jl

This file was deleted.

1 change: 1 addition & 0 deletions src/AcceleratedKernels.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ module AcceleratedKernels
using ArgCheck: @argcheck
using GPUArraysCore: AbstractGPUArray, @allowscalar
using KernelAbstractions
import UnsafeAtomics


# Exposed functions from upstream packages
Expand Down
4 changes: 1 addition & 3 deletions src/accumulate/accumulate.jl
Original file line number Diff line number Diff line change
Expand Up @@ -160,9 +160,7 @@ function _accumulate_impl!(
dims::Union{Nothing, Int}=nothing,
inclusive::Bool=true,

# FIXME: Switch back to `DecoupledLookback()` as the default algorithm
# once https://github.com/JuliaGPU/AcceleratedKernels.jl/pull/44 is merged.
alg::AccumulateAlgorithm=ScanPrefixes(),
alg::AccumulateAlgorithm=DecoupledLookback(),

# CPU settings
max_tasks::Int=Threads.nthreads(),
Expand Down
20 changes: 13 additions & 7 deletions src/accumulate/accumulate_1d_gpu.jl
Original file line number Diff line number Diff line change
Expand Up @@ -169,9 +169,9 @@ end
running_prefix = prefixes[iblock - 0x1 + 0x1]
inspected_block = signed(typeof(iblock))(iblock) - 0x2
while inspected_block >= 0x0

# Opportunistic: a previous block finished everything
if flags[inspected_block + 0x1] == ACC_FLAG_A
if UnsafeAtomics.load(pointer(flags, inspected_block + 0x1), UnsafeAtomics.monotonic) == ACC_FLAG_A
UnsafeAtomics.fence(UnsafeAtomics.acquire) # (fence before reading from v)
# Previous blocks (except last) always have filled values in v, so index is inbounds
running_prefix = op(running_prefix, v[(inspected_block + 0x1) * block_size * 0x2])
break
Expand All @@ -194,11 +194,17 @@ end
end

# Set flag for "aggregate of all prefixes up to this block finished"
@synchronize() # This is needed so that the flag is not set before copying into v, but
# there should be better memory fences to guarantee ordering without
# thread synchronization...
# There are two synchronization concerns here:
# 1. Within a group we want to ensure that all writes to `v` have occurred before setting the flag.
# 2. Between groups we need to use a fence and atomic load/store to ensure that memory operations are not reordered.
@synchronize() # within-block
# Note: This fence is needed to ensure that the flag is not set before copying into v.
# See https://doc.rust-lang.org/std/sync/atomic/fn.fence.html
# for more details.
# We use the happens-before relation between stores to `v` and the store to `flags`.
UnsafeAtomics.fence(UnsafeAtomics.release)
if ithread == 0x0
flags[iblock + 0x1] = ACC_FLAG_A
UnsafeAtomics.store!(pointer(flags, iblock + 0x1), convert(eltype(flags), ACC_FLAG_A), UnsafeAtomics.monotonic)
end
end

Expand Down Expand Up @@ -285,7 +291,7 @@ function accumulate_1d!(
end

if isnothing(temp_flags)
flags = similar(v, Int8, num_blocks)
flags = similar(v, UInt8, num_blocks)
else
@argcheck eltype(temp_flags) <: Integer
@argcheck length(temp_flags) >= num_blocks
Expand Down
Loading