813 changes: 414 additions & 399 deletions .buildkite/pipeline.yml

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion Project.toml
@@ -67,7 +67,7 @@ ExprTools = "0.1"
GPUArrays = "11.2.4"
GPUCompiler = "1.4"
GPUToolbox = "0.3, 1"
KernelAbstractions = "0.9.38"
KernelAbstractions = "0.10"
LLVM = "9.3.1"
LLVMLoopInfo = "1"
LazyArtifacts = "1"
1 change: 1 addition & 0 deletions src/CUDA.jl
@@ -3,6 +3,7 @@ module CUDA
using GPUCompiler

using GPUArrays
import KernelAbstractions: KernelIntrinsics as KI

using GPUToolbox

60 changes: 43 additions & 17 deletions src/CUDAKernels.jl
@@ -4,6 +4,7 @@ using ..CUDA
using ..CUDA: @device_override, CUSPARSE, default_memory, UnifiedMemory

import KernelAbstractions as KA
import KernelAbstractions: KernelIntrinsics as KI

import StaticArrays
import SparseArrays: AbstractSparseArray
@@ -157,37 +158,61 @@ function (obj::KA.Kernel{CUDABackend})(args...; ndrange=nothing, workgroupsize=n
return nothing
end


function KI.KIKernel(::CUDABackend, f, args...; kwargs...)
kern = eval(quote
@cuda launch=false $(kwargs...) $(f)($(args...))
end)
KI.KIKernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
end

function (obj::KI.KIKernel{CUDABackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...)
threads = isnothing(workgroupsize) ? 1 : workgroupsize
blocks = isnothing(numworkgroups) ? 1 : numworkgroups

obj.kern(args...; threads, blocks, kwargs...)
end


function KI.kernel_max_work_group_size(::CUDABackend, kikern::KI.KIKernel{<:CUDABackend}; max_work_items::Int=typemax(Int))::Int
# maximum threads per block supported by this compiled kernel
Int(min(CUDA.maxthreads(kikern.kern), max_work_items))
end
function KI.max_work_group_size(::CUDABackend)::Int
Int(attribute(device(), DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
end
function KI.multiprocessor_count(::CUDABackend)::Int
Int(attribute(device(), DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
end

## indexing

## COV_EXCL_START
@device_override @inline function KA.__index_Local_Linear(ctx)
return threadIdx().x
@device_override @inline function KI.get_local_id()
return (; x = Int(threadIdx().x), y = Int(threadIdx().y), z = Int(threadIdx().z))
end


@device_override @inline function KA.__index_Group_Linear(ctx)
return blockIdx().x
@device_override @inline function KI.get_group_id()
return (; x = Int(blockIdx().x), y = Int(blockIdx().y), z = Int(blockIdx().z))
end

@device_override @inline function KA.__index_Global_Linear(ctx)
I = @inbounds KA.expand(KA.__iterspace(ctx), blockIdx().x, threadIdx().x)
# TODO: This is unfortunate, can we get the linear index cheaper
@inbounds LinearIndices(KA.__ndrange(ctx))[I]
@device_override @inline function KI.get_global_id()
return (; x = Int((blockIdx().x-1i32) * blockDim().x + threadIdx().x), y = Int((blockIdx().y-1i32) * blockDim().y + threadIdx().y), z = Int((blockIdx().z-1i32) * blockDim().z + threadIdx().z))
end

@device_override @inline function KA.__index_Local_Cartesian(ctx)
@inbounds KA.workitems(KA.__iterspace(ctx))[threadIdx().x]
@device_override @inline function KI.get_local_size()
return (; x = Int(blockDim().x), y = Int(blockDim().y), z = Int(blockDim().z))
end

@device_override @inline function KA.__index_Group_Cartesian(ctx)
@inbounds KA.blocks(KA.__iterspace(ctx))[blockIdx().x]
@device_override @inline function KI.get_num_groups()
return (; x = Int(gridDim().x), y = Int(gridDim().y), z = Int(gridDim().z))
end

@device_override @inline function KA.__index_Global_Cartesian(ctx)
return @inbounds KA.expand(KA.__iterspace(ctx), blockIdx().x, threadIdx().x)
@device_override @inline function KI.get_global_size()
return (; x = Int(blockDim().x * gridDim().x), y = Int(blockDim().y * gridDim().y), z = Int(blockDim().z * gridDim().z))
end

@device_override @inline function KA.__validindex(ctx)
@device_override @inline function KI.__validindex(ctx)
if KA.__dynamic_checkbounds(ctx)
I = @inbounds KA.expand(KA.__iterspace(ctx), blockIdx().x, threadIdx().x)
return I in KA.__ndrange(ctx)
@@ -198,7 +223,8 @@ end

## shared and scratch memory

@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
# @device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}) where {T, Dims}
CuStaticSharedArray(T, Dims)
end

@@ -208,7 +234,7 @@ end

## synchronization and printing

@device_override @inline function KA.__synchronize()
@device_override @inline function KI.barrier()
sync_threads()
end
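A minimal usage sketch of the new KI.KIKernel wrapper (illustrative only: the vadd! kernel and the launch sizes below are assumptions, not part of this PR). The numworkgroups/workgroupsize keywords are forwarded to CUDA's blocks/threads:

using CUDA
import KernelAbstractions: KernelIntrinsics as KI

# illustrative device kernel written against the KI intrinsics overridden above
function vadd!(c, a, b)
    i = KI.get_local_id().x + (KI.get_group_id().x - 1) * KI.get_local_size().x
    if i <= length(c)
        @inbounds c[i] = a[i] + b[i]
    end
    return
end

a, b = CUDA.rand(1024), CUDA.rand(1024)
c = similar(a)

# compile without launching, then launch with the KI-style keywords
k = KI.KIKernel(CUDABackend(), vadd!, c, a, b)
k(c, a, b; workgroupsize=256, numworkgroups=cld(length(c), 256))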

26 changes: 13 additions & 13 deletions src/accumulate.jl
@@ -15,16 +15,16 @@
function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArray,
Rdim, Rpre, Rpost, Rother, neutral, init,
::Val{inclusive}=Val(true)) where {T, inclusive}
threads = blockDim().x
thread = threadIdx().x
block = blockIdx().x
threads = KI.get_local_size().x
thread = KI.get_local_id().x
block = KI.get_group_id().x

temp = CuDynamicSharedArray(T, (2*threads,))

# iterate the main dimension using threads and the first block dimension
i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
# iterate the other dimensions using the remaining block dimensions
j = (blockIdx().z-1i32) * gridDim().y + blockIdx().y
j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y

if j > length(Rother)
return
@@ -47,7 +47,7 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
offset = 1
d = threads>>1
while d > 0
sync_threads()
KI.barrier()
@inbounds if thread <= d
ai = offset * (2*thread-1)
bi = offset * (2*thread)
@@ -66,7 +66,7 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
d = 1
while d < threads
offset >>= 1
sync_threads()
KI.barrier()
@inbounds if thread <= d
ai = offset * (2*thread-1)
bi = offset * (2*thread)
Expand All @@ -78,7 +78,7 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
d *= 2
end

sync_threads()
KI.barrier()

# write results to device memory
@inbounds if i <= length(Rdim)
@@ -100,14 +100,14 @@ end
function aggregate_partial_scan(op::Function, output::AbstractArray,
aggregates::AbstractArray, Rdim, Rpre, Rpost, Rother,
init)
threads = blockDim().x
thread = threadIdx().x
block = blockIdx().x
threads = KI.get_local_size().x
thread = KI.get_local_id().x
block = KI.get_group_id().x

# iterate the main dimension using threads and the first block dimension
i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
# iterate the other dimensions using the remaining block dimensions
j = (blockIdx().z-1i32) * gridDim().y + blockIdx().y
j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y

@inbounds if i <= length(Rdim) && j <= length(Rother)
I = Rother[j]
8 changes: 3 additions & 5 deletions src/device/random.jl
@@ -63,18 +63,16 @@ end
@inline Philox2x32() = Philox2x32{7}()

@inline function Base.getproperty(rng::Philox2x32, field::Symbol)
threadId = threadIdx().x + (threadIdx().y - 1i32) * blockDim().x +
(threadIdx().z - 1i32) * blockDim().x * blockDim().y
warpId = (threadId - 1i32) >> 0x5 + 1i32 # fld1

if field === :key
@inbounds global_random_keys()[warpId]
elseif field === :ctr1
@inbounds global_random_counters()[warpId]
elseif field === :ctr2
blockId = blockIdx().x + (blockIdx().y - 1i32) * gridDim().x +
(blockIdx().z - 1i32) * gridDim().x * gridDim().y
globalId = threadId + (blockId - 1i32) * (blockDim().x * blockDim().y * blockDim().z)
globalId = KI.get_global_id().x +
(KI.get_global_id().y - 1i32) * KI.get_global_size().x +
(KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
globalId%UInt32
end::UInt32
end
2 changes: 1 addition & 1 deletion src/indexing.jl
@@ -33,7 +33,7 @@ function Base.findall(bools::AnyCuArray{Bool})
if n > 0
## COV_EXCL_START
function kernel(ys::CuDeviceArray, bools, indices)
i = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
i = KI.get_local_id().x + (KI.get_group_id().x - 1i32) * KI.get_local_size().x

@inbounds if i <= length(bools) && bools[i]
i′ = CartesianIndices(bools)[i]
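As a sanity check on the index rewrites in indexing.jl, accumulate.jl and mapreduce.jl, a small sketch (illustrative, not part of this PR) showing that the KI-based expression matches the old threadIdx/blockIdx formulation and agrees with KI.get_global_id:

using CUDA
import KernelAbstractions: KernelIntrinsics as KI

function index_check!(flags)
    old_i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    new_i = KI.get_local_id().x + (KI.get_group_id().x - 1) * KI.get_local_size().x
    if old_i <= length(flags)
        @inbounds flags[old_i] = (old_i == new_i) & (new_i == KI.get_global_id().x)
    end
    return
end

flags = CUDA.zeros(Bool, 1024)
@cuda threads=256 blocks=4 index_check!(flags)
@assert all(Array(flags))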
40 changes: 20 additions & 20 deletions src/mapreduce.jl
@@ -19,9 +19,9 @@ end
@inline function reduce_block(op, val::T, neutral, shuffle::Val{true}) where T
# shared mem for partial sums
assume(warpsize() == 32)
shared = CuStaticSharedArray(T, 32)
shared = KI.localmemory(T, 32)

wid, lane = fldmod1(threadIdx().x, warpsize())
wid, lane = fldmod1(KI.get_local_id().x, warpsize())

# each warp performs partial reduction
val = reduce_warp(op, val)
@@ -32,10 +32,10 @@ end
end

# wait for all partial reductions
sync_threads()
KI.barrier()

# read from shared memory only if that warp existed
val = if threadIdx().x <= fld1(blockDim().x, warpsize())
val = if KI.get_local_id().x <= fld1(KI.get_local_size().x, warpsize())
@inbounds shared[lane]
else
neutral
@@ -49,8 +49,8 @@ end
return val
end
@inline function reduce_block(op, val::T, neutral, shuffle::Val{false}) where T
threads = blockDim().x
thread = threadIdx().x
threads = KI.get_local_size().x
thread = KI.get_local_id().x

# shared mem for a complete reduction
shared = CuDynamicSharedArray(T, (threads,))
@@ -59,7 +59,7 @@ end
# perform a reduction
d = 1
while d < threads
sync_threads()
KI.barrier()
index = 2 * d * (thread-1) + 1
@inbounds if index <= threads
other_val = if index + d <= threads
@@ -92,10 +92,10 @@ function partial_mapreduce_grid(f, op, neutral, Rreduce, Rother, shuffle, R::Abs

# decompose the 1D hardware indices into separate ones for reduction (across threads
# and possibly blocks if it doesn't fit) and other elements (remaining blocks)
threadIdx_reduce = threadIdx().x
blockDim_reduce = blockDim().x
blockIdx_reduce, blockIdx_other = fldmod1(blockIdx().x, length(Rother))
gridDim_reduce = gridDim().x ÷ length(Rother)
threadIdx_reduce = KI.get_local_id().x
blockDim_reduce = KI.get_local_size().x
blockIdx_reduce, blockIdx_other = fldmod1(KI.get_group_id().x, length(Rother))
gridDim_reduce = KI.get_num_groups().x ÷ length(Rother)

# block-based indexing into the values outside of the reduction dimension
# (that means we can safely synchronize threads within this block)
@@ -134,7 +134,7 @@ end
end

function serial_mapreduce_kernel(f, op, neutral, Rreduce, Rother, R, As)
grid_idx = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
grid_idx = KI.get_local_id().x + (KI.get_group_id().x - 1i32) * KI.get_local_size().x
@inbounds if grid_idx <= length(Rother)
Iother = Rother[grid_idx]

@@ -160,14 +160,14 @@

# factored out for use in tests
function serial_mapreduce_threshold(dev)
max_concurrency = attribute(dev, DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK) *
attribute(dev, DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
max_concurrency = KI.max_work_group_size(CUDABackend()) * KI.multiprocessor_count(CUDABackend())
return max_concurrency
end

function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
A::Union{AbstractArray,Broadcast.Broadcasted};
init=nothing) where {F, OP, T}
backend = CUDABackend()
if !isa(A, Broadcast.Broadcasted)
# XXX: Base.axes isn't defined anymore for Broadcasted, breaking this check
Base.check_reducedims(R, A)
@@ -201,11 +201,11 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
# If `Rother` is large enough, then a naive loop is more efficient than partial reductions.
if length(Rother) >= serial_mapreduce_threshold(dev)
args = (f, op, init, Rreduce, Rother, R, A)
kernel = @cuda launch=false serial_mapreduce_kernel(args...)
kernel = KI.KIKernel(backend, serial_mapreduce_kernel, args...)
kernel_config = launch_configuration(kernel.kern.fun)
threads = kernel_config.threads
blocks = cld(length(Rother), threads)
kernel(args...; threads, blocks)
kernel(args...; workgroupsize=threads, numworkgroups=blocks)
return R
end

@@ -228,9 +228,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
# we might not be able to launch all those threads to reduce each slice in one go.
# that's why each threads also loops across their inputs, processing multiple values
# so that we can span the entire reduction dimension using a single thread block.
kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), R, A)
kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), R, A)
compute_shmem(threads) = shuffle ? 0 : threads*sizeof(T)
kernel_config = launch_configuration(kernel.fun; shmem=compute_shmem∘compute_threads)
kernel_config = launch_configuration(kernel.kern.fun; shmem=compute_shmem∘compute_threads)
reduce_threads = compute_threads(kernel_config.threads)
reduce_shmem = compute_shmem(reduce_threads)

@@ -255,7 +255,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
# perform the actual reduction
if reduce_blocks == 1
# we can cover the dimensions to reduce using a single block
kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; threads, blocks, shmem)
kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; workgroupsize=threads, numworkgroups=blocks, shmem)
else
# TODO: provide a version that atomically reduces from different blocks

@@ -286,7 +286,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
end

partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)

GPUArrays.mapreducedim!(identity, op, R, partial; init)
end
8 changes: 5 additions & 3 deletions test/base/kernelabstractions.jl
@@ -4,7 +4,9 @@ using SparseArrays

include(joinpath(dirname(pathof(KernelAbstractions)), "..", "test", "testsuite.jl"))

Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray)
Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests=Set([
"CPU synchronization",
"fallback test: callable types",]))
for (PreferBlocks, AlwaysInline) in Iterators.product((true, false), (true, false))
Testsuite.unittest_testsuite(()->CUDABackend(PreferBlocks, AlwaysInline), "CUDA", CUDA, CuDeviceArray)
end
@@ -16,13 +18,13 @@
@testset "CUDA Backend Adapt Tests" begin
# CPU → GPU
A = sprand(Float32, 10, 10, 0.5) #CSC
A_d = adapt(CUDABackend(), A)
@test A_d isa CUSPARSE.CuSparseMatrixCSC
@test adapt(CUDABackend(), A_d) |> typeof == typeof(A_d)

# GPU → CPU
B_d = A |> cu # CuCSC
B = adapt(KA.CPU(), A_d)
@test B isa SparseMatrixCSC
@test adapt(KA.CPU(), B) |> typeof == typeof(B)
end