Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions .github/workflows/CI-CPU.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,46 @@ jobs:
- uses: julia-actions/julia-runtest@v1
env:
JULIA_NUM_THREADS: ${{ matrix.env.JULIA_NUM_THREADS }}
# CI job "OpenCL": checks out the repository, sets up Julia (version `1`, x64),
# builds the package and runs its tests with the `--OpenCL` test argument on an
# `ubuntu-latest` runner. The job is limited to 60 minutes.
OpenCL:
name: OpenCL
runs-on: ubuntu-latest
timeout-minutes: 60
permissions: # needed to allow julia-actions/cache to proactively delete old caches that it has created
actions: write
contents: read
# NOTE(review): no `matrix` is declared for this job, so `fail-fast` appears to
# have nothing to act on here — confirm whether it is intentional.
strategy:
fail-fast: true
steps:
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v2
with:
version: 1
arch: x64
- uses: julia-actions/cache@v2
- uses: julia-actions/julia-buildpkg@v1
- uses: julia-actions/julia-runtest@v1
with:
# Extra argument handed to the test run; presumably the package's test entry
# point uses it to select the OpenCL backend — TODO confirm against the test suite.
test_args: '--OpenCL'
# cpuKA:
# name: KA CPU Backend
# runs-on: ubuntu-latest
# timeout-minutes: 60
# permissions: # needed to allow julia-actions/cache to proactively delete old caches that it has created
# actions: write
# contents: read
# strategy:
# fail-fast: true
# steps:
# - uses: actions/checkout@v4
# - uses: julia-actions/setup-julia@v2
# with:
# version: 1
# arch: x64
# - uses: julia-actions/cache@v2
# - uses: julia-actions/julia-buildpkg@v1
# - uses: julia-actions/julia-runtest@v1
# with:
# test_args: '--cpuKA'
docs:
name: Documentation
runs-on: ubuntu-latest
Expand Down
2 changes: 1 addition & 1 deletion src/AcceleratedKernels.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ module AcceleratedKernels

# Internal dependencies
using ArgCheck: @argcheck
using GPUArraysCore: AbstractGPUArray, @allowscalar
using GPUArraysCore: AnyGPUArray, @allowscalar
using KernelAbstractions


Expand Down
24 changes: 17 additions & 7 deletions src/accumulate/accumulate.jl
Original file line number Diff line number Diff line change
Expand Up @@ -167,24 +167,34 @@ function _accumulate_impl!(
# CPU settings
max_tasks::Int=Threads.nthreads(),
min_elems::Int=2,
prefer_threads::Bool=true,

# GPU settings
block_size::Int=256,
temp::Union{Nothing, AbstractArray}=nothing,
temp_flags::Union{Nothing, AbstractArray}=nothing,
)
if isnothing(dims)
return accumulate_1d!(
op, v, backend, alg;
init, neutral, inclusive,
max_tasks, min_elems,
block_size, temp, temp_flags,
)
return if use_KA_algo(v, prefer_threads)
accumulate_1d_gpu!(
op, v, backend, alg;
init, neutral, inclusive,
max_tasks, min_elems,
block_size, temp, temp_flags,
)
else
accumulate_1d_cpu!(
op, v, backend, alg;
init, neutral, inclusive,
max_tasks, min_elems,
block_size, temp, temp_flags,
)
end
else
return accumulate_nd!(
op, v, backend;
init, neutral, dims, inclusive,
max_tasks, min_elems,
max_tasks, min_elems, prefer_threads,
block_size,
)
end
Expand Down
4 changes: 2 additions & 2 deletions src/accumulate/accumulate_1d_cpu.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
function accumulate_1d!(
op, v::AbstractArray, backend::CPU, alg;
function accumulate_1d_cpu!(
op, v::AbstractArray, backend::Backend, alg;
init,
neutral,
inclusive::Bool,
Expand Down
8 changes: 4 additions & 4 deletions src/accumulate/accumulate_1d_gpu.jl
Original file line number Diff line number Diff line change
Expand Up @@ -248,8 +248,8 @@ end


# DecoupledLookback algorithm
function accumulate_1d!(
op, v::AbstractArray, backend::GPU, ::DecoupledLookback;
function accumulate_1d_gpu!(
op, v::AbstractArray, backend::Backend, ::DecoupledLookback;
init,
neutral,
inclusive::Bool,
Expand Down Expand Up @@ -307,8 +307,8 @@ end


# ScanPrefixes algorithm
function accumulate_1d!(
op, v::AbstractArray, backend::GPU, ::ScanPrefixes;
function accumulate_1d_gpu!(
op, v::AbstractArray, backend, ::ScanPrefixes;
init,
neutral,
inclusive::Bool,
Expand Down
3 changes: 2 additions & 1 deletion src/accumulate/accumulate_nd.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ function accumulate_nd!(
# CPU settings
max_tasks::Int,
min_elems::Int,
prefer_threads::Bool=true,

# GPU settings
block_size::Int,
Expand All @@ -34,7 +35,7 @@ function accumulate_nd!(

# Degenerate cases end

if backend isa CPU
if !use_KA_algo(v, prefer_threads)
_accumulate_nd_cpu_sections!(op, v; init, dims, inclusive, max_tasks, min_elems)
else
# On GPUs we have two parallelisation approaches, based on which dimension has more elements:
Expand Down
10 changes: 6 additions & 4 deletions src/foreachindex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ end
function _forindices_gpu(
f,
indices,
backend::GPU;
backend::Backend;

block_size::Int=256,
)
Expand Down Expand Up @@ -125,11 +125,12 @@ function foreachindex(
# CPU settings
max_tasks=Threads.nthreads(),
min_elems=1,
prefer_threads::Bool=true,

# GPU settings
block_size=256,
)
if backend isa GPU
if use_KA_algo(itr, prefer_threads)
_forindices_gpu(f, eachindex(itr), backend; block_size)
else
_forindices_threads(f, eachindex(itr); max_tasks, min_elems)
Expand Down Expand Up @@ -218,6 +219,7 @@ function foraxes(
# CPU settings
max_tasks=Threads.nthreads(),
min_elems=1,
prefer_threads::Bool=true,

# GPU settings
block_size=256,
Expand All @@ -226,11 +228,11 @@ function foraxes(
return foreachindex(
f, itr, backend;
max_tasks, min_elems,
block_size,
prefer_threads, block_size,
)
end

if backend isa GPU
if use_KA_algo(itr, prefer_threads)
_forindices_gpu(f, axes(itr, dims), backend; block_size)
else
_forindices_threads(f, axes(itr, dims); max_tasks, min_elems)
Expand Down
11 changes: 2 additions & 9 deletions src/map.jl
Original file line number Diff line number Diff line change
Expand Up @@ -33,19 +33,12 @@ end
"""
function map!(
f, dst::AbstractArray, src::AbstractArray, backend::Backend=get_backend(src);

# CPU settings
max_tasks=Threads.nthreads(),
min_elems=1,

# GPU settings
block_size=256,
kwargs...
)
@argcheck length(dst) == length(src)
foreachindex(
src, backend;
max_tasks, min_elems,
block_size,
kwargs...
) do idx
dst[idx] = f(src[idx])
end
Expand Down
12 changes: 8 additions & 4 deletions src/predicates.jl
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,12 @@ function _any_impl(
# CPU settings
max_tasks=Threads.nthreads(),
min_elems=1,
prefer_threads::Bool=true,

# GPU settings
block_size::Int=256,
)
if backend isa GPU
if use_KA_algo(v, prefer_threads)
@argcheck block_size > 0

# Some platforms crash when multiple threads write to the same memory location in a global
Expand All @@ -137,7 +138,8 @@ function _any_impl(
backend;
init=false,
neutral=false,
block_size=block_size,
prefer_threads=true,
block_size,
temp=alg.temp,
switch_below=alg.switch_below,
)
Expand Down Expand Up @@ -246,11 +248,12 @@ function _all_impl(
# CPU settings
max_tasks=Threads.nthreads(),
min_elems=1,
prefer_threads::Bool=true,

# GPU settings
block_size::Int=256,
)
if backend isa GPU
if use_KA_algo(v, prefer_threads)
@argcheck block_size > 0

# Some platforms crash when multiple threads write to the same memory location in a global
Expand All @@ -269,7 +272,8 @@ function _all_impl(
backend;
init=true,
neutral=true,
block_size=block_size,
prefer_threads=false,
block_size,
temp=alg.temp,
switch_below=alg.switch_below,
)
Expand Down
4 changes: 2 additions & 2 deletions src/reduce/mapreduce_1d_cpu.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
function mapreduce_1d(
f, op, src::AbstractArray, backend::CPU;
function mapreduce_1d_cpu(
f, op, src::AbstractArray, backend::Backend;
init,
neutral,

Expand Down
4 changes: 2 additions & 2 deletions src/reduce/mapreduce_1d_gpu.jl
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,8 @@
end


function mapreduce_1d(
f, op, src::AbstractArray, backend::GPU;
function mapreduce_1d_gpu(
f, op, src::AbstractArray, backend::Backend;
init,
neutral,

Expand Down
3 changes: 2 additions & 1 deletion src/reduce/mapreduce_nd.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ function mapreduce_nd(
# CPU settings - ignored here
max_tasks::Int,
min_elems::Int,
prefer_threads::Bool=true,

# GPU settings
block_size::Int,
Expand Down Expand Up @@ -113,7 +114,7 @@ function mapreduce_nd(
end
dst_size = length(dst)

if backend isa CPU
if !use_KA_algo(src, prefer_threads)
_mapreduce_nd_cpu_sections!(
f, op, dst, src;
init,
Expand Down
29 changes: 20 additions & 9 deletions src/reduce/reduce.jl
Original file line number Diff line number Diff line change
Expand Up @@ -175,25 +175,36 @@ function _mapreduce_impl(
# CPU settings
max_tasks::Int=Threads.nthreads(),
min_elems::Int=1,
prefer_threads::Bool=true,

# GPU settings
block_size::Int=256,
temp::Union{Nothing, AbstractArray}=nothing,
switch_below::Int=0,
)
if isnothing(dims)
return mapreduce_1d(
f, op, src, backend;
init, neutral,
max_tasks, min_elems,
block_size, temp,
switch_below
)
if use_KA_algo(src, prefer_threads)
mapreduce_1d_gpu(
f, op, src, backend;
init, neutral,
max_tasks, min_elems,
block_size, temp,
switch_below
)
else
mapreduce_1d_cpu(
f, op, src, backend;
init, neutral,
max_tasks, min_elems,
block_size, temp,
switch_below
)
end
else
return mapreduce_nd(
f, op, src, backend;
init, neutral,
dims, max_tasks=max_tasks,
init, neutral, dims,
max_tasks, prefer_threads,
min_elems, block_size,
temp,
)
Expand Down
14 changes: 7 additions & 7 deletions src/sort/merge_sort.jl
Original file line number Diff line number Diff line change
Expand Up @@ -125,27 +125,27 @@ end

"""
merge_sort!(
v::AbstractGPUArray, backend::Backend=get_backend(v);
v::AbstractArray, backend::Backend=get_backend(v);

lt=isless,
by=identity,
rev::Union{Nothing, Bool}=nothing,
order::Base.Order.Ordering=Base.Order.Forward,

block_size::Int=256,
temp::Union{Nothing, AbstractGPUArray}=nothing,
temp::Union{Nothing, AbstractArray}=nothing,
)
"""
function merge_sort!(
v::AbstractGPUArray, backend::Backend=get_backend(v);
v::AbstractArray, backend::Backend=get_backend(v);

lt=isless,
by=identity,
rev::Union{Nothing, Bool}=nothing,
order::Base.Order.Ordering=Base.Order.Forward,

block_size::Int=256,
temp::Union{Nothing, AbstractGPUArray}=nothing,
temp::Union{Nothing, AbstractArray}=nothing,
)
# Simple sanity checks
@argcheck block_size > 0
Expand Down Expand Up @@ -195,19 +195,19 @@ end

"""
merge_sort(
v::AbstractGPUArray, backend::Backend=get_backend(v);
v::AbstractArray, backend::Backend=get_backend(v);

lt=isless,
by=identity,
rev::Union{Nothing, Bool}=nothing,
order::Base.Order.Ordering=Base.Order.Forward,

block_size::Int=256,
temp::Union{Nothing, AbstractGPUArray}=nothing,
temp::Union{Nothing, AbstractArray}=nothing,
)
"""
function merge_sort(
v::AbstractGPUArray, backend::Backend=get_backend(v);
v::AbstractArray, backend::Backend=get_backend(v);
kwargs...
)
v_copy = copy(v)
Expand Down
Loading
Loading