Trial support for thread-block clusters #2825

Draft: wants to merge 1 commit into master
lib/cudadrv/execution.jl (22 additions, 4 deletions)

@@ -44,22 +44,29 @@ end

"""
launch(f::CuFunction; args...; blocks::CuDim=1, threads::CuDim=1,
cooperative=false, shmem=0, stream=stream())
clusters::CuDim=1, cooperative=false, shmem=0, stream=stream())

Low-level call to launch a CUDA function `f` on the GPU, using `blocks` and `threads` as
respectively the grid and block configuration. Dynamic shared memory is allocated according
to `shmem`, and the kernel is launched on stream `stream`. If `clusters > 1` and the device's
compute capability is 9.0 or higher, [thread block clusters](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#thread-block-clusters)
are launched. If `clusters > 1` on a device with compute capability below 9.0, an
`ArgumentError` is thrown, as thread block clusters are not supported there.

Arguments to a kernel should either be bitstype, in which case they will be copied to the
internal kernel parameter buffer, or a pointer to device memory.

This is a low-level call, prefer to use [`cudacall`](@ref) instead.
"""
function launch(f::CuFunction, args::Vararg{Any,N}; blocks::CuDim=1, threads::CuDim=1,
                clusters::CuDim=1, cooperative::Bool=false, shmem::Integer=0,
                stream::CuStream=stream()) where {N}
    blockdim = CuDim3(blocks)
    threaddim = CuDim3(threads)
    clusterdim = CuDim3(clusters)
    if CUDA.capability(device()) < v"9.0" &&
       (clusterdim.x != 1 || clusterdim.y != 1 || clusterdim.z != 1)
        throw(ArgumentError("devices with compute capability below 9.0 do not support thread block clusters!"))
    end

    try
        pack_arguments(args...) do kernelParams
            if cooperative
                cuLaunchCooperativeKernel(f,
                                          blockdim.x, blockdim.y, blockdim.z,
                                          threaddim.x, threaddim.y, threaddim.z,
                                          shmem, stream, kernelParams)
            elseif clusterdim.x == 1 && clusterdim.y == 1 && clusterdim.z == 1
                cuLaunchKernel(f,
                               blockdim.x, blockdim.y, blockdim.z,
                               threaddim.x, threaddim.y, threaddim.z,
                               shmem, stream, kernelParams, C_NULL)
            else
                # thread block clusters require the extended launch API:
                # describe the cluster dimensions through a launch attribute ...
                attr_val = CUlaunchAttributeValue()
                attr_val.clusterDim.x = clusterdim.x
                attr_val.clusterDim.y = clusterdim.y
                attr_val.clusterDim.z = clusterdim.z
                attr = CUlaunchAttribute(CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION,
                                         (0, 0, 0, 0), attr_val)
                # ... and pass it alongside the grid/block/shmem/stream configuration
                config = CUlaunchConfig(blockdim.x, blockdim.y, blockdim.z,
                                        threaddim.x, threaddim.y, threaddim.z,
                                        shmem, stream, Ref(attr), Cuint(1))
                cuLaunchKernelEx(config, f, kernelParams, C_NULL)
            end
        end
    catch err
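For illustration, here is a minimal sketch of driving this low-level API directly. It assumes an sm_90 device and access to the compiled `CuFunction` via CUDA.jl's `HostKernel.fun` field; the `noop` kernel and the launch geometry are hypothetical:

```julia
using CUDA

noop() = return

k = @cuda launch=false noop()  # compile the kernel without launching it
if CUDA.capability(device()) >= v"9.0"
    # 8 blocks of 128 threads, grouped into clusters of 2 blocks along x;
    # the cluster dimensions must evenly divide the grid dimensions
    CUDA.launch(k.fun; blocks=8, threads=128, clusters=2)
    synchronize()
end
```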
src/compiler/execution.jl (5 additions, 1 deletion)

@@ -7,7 +7,7 @@ export @cuda, cudaconvert, cufunction, dynamic_cufunction, nextwarp, prevwarp

const MACRO_KWARGS = [:dynamic, :launch]
const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx]
const LAUNCH_KWARGS = [:cooperative, :blocks, :threads, :clusters, :shmem, :stream]


"""
@@ -224,6 +224,10 @@ The following keyword arguments are supported:
- `blocks` (default: `1`): Number of thread blocks to launch, or a 1-, 2- or 3-tuple of
  dimensions (e.g. `blocks=(2, 4, 2)` for a 3D grid of blocks).
  Use [`blockIdx()`](@ref) and [`gridDim()`](@ref) to query from within the kernel.
- `clusters` (default: `1`): Number of thread blocks to group into each cluster,
  or a 1-, 2- or 3-tuple of dimensions (e.g. `clusters=(2, 2, 2)` for a 3D cluster).
  Use [`clusterIdx()`](@ref) and [`clusterDim()`](@ref) to query from within the kernel.
  Only supported on devices with compute capability 9.0 or higher; with the default of `1`,
  no cluster configuration is applied. See the sketch after this file's changes.
- `shmem` (default: `0`): Amount of dynamic shared memory in bytes to allocate per thread
  block; used by [`CuDynamicSharedArray`](@ref).
- `stream` (default: [`stream()`](@ref)): [`CuStream`](@ref) to launch the kernel on.
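A hedged sketch of the high-level path, assuming the `clusters` keyword from this PR and an sm_90 device; the kernel and launch geometry are illustrative only:

```julia
using CUDA

function cluster_kernel(a)
    i = (blockIdx().x - 1i32) * blockDim().x + threadIdx().x
    if i <= length(a)
        @inbounds a[i] = clusterIdx().x  # record which cluster wrote this element
    end
    return
end

a = CUDA.zeros(Int32, 256)
# 4 blocks of 64 threads, grouped into clusters of 2 blocks each
@cuda threads=64 blocks=4 clusters=2 cluster_kernel(a)
```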
src/device/intrinsics/indexing.jl (28 additions, 1 deletion)

@@ -1,7 +1,7 @@
# Indexing and dimensions (B.4)

export
    threadIdx, blockDim, blockIdx, gridDim, clusterIdx, clusterDim,
    laneid, lanemask, warpsize, active_mask, FULL_MASK

@generated function _index(::Val{name}, ::Val{range}) where {name, range}
@@ -38,6 +38,9 @@ end
# https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
const max_block_size = (x=1024, y=1024, z=64)
const max_grid_size = (x=2^31-1, y=65535, z=65535)
# the portable maximum cluster size is 8 blocks in total; Hopper can opt in to 16
# https://forums.developer.nvidia.com/t/cluster-size-limitation/279795
const max_cluster_size = (x=8, y=8, z=8)

for dim in (:x, :y, :z)
    # Thread index
@@ -59,6 +62,16 @@ for dim in (:x, :y, :z)
fn = Symbol("gridDim_$dim")
intr = Symbol("nctaid.$dim")
@eval @inline $fn() = _index($(Val(intr)), $(Val(1:max_grid_size[dim])))

# Cluster index
fn = Symbol("clusterIdx_$dim")
intr = Symbol("clusterid.$dim")
@eval @inline $fn() = _index($(Val(intr)), $(Val(0:max_cluster_size[dim]-1))) + 1i32

# Cluster size in grid (#clusters per grid)
fn = Symbol("clusterDim_$dim")
intr = Symbol("nclusterid.$dim")
@eval @inline $fn() = _index($(Val(intr)), $(Val(1:max_cluster_size[dim])))
end

@device_functions begin
@@ -70,6 +83,20 @@
Returns the dimensions of the grid.
"""
@inline gridDim() = (x=gridDim_x(), y=gridDim_y(), z=gridDim_z())

"""
clusterIdx()::NamedTuple

Returns the cluster index within the grid.
"""
@inline clusterIdx() = (x=clusterIdx_x(), y=clusterIdx_y(), z=clusterIdx_z())

"""
clusterDim()::NamedTuple

Returns the dimensions of the cluster.
"""
@inline clusterDim() = (x=clusterDim_x(), y=clusterDim_y(), z=clusterDim_z())

"""
    blockIdx()::NamedTuple

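To show what the new intrinsics report at run time, a hedged device-side sketch (assuming the 1-based convention of CUDA.jl's other index intrinsics and an sm_90 device):

```julia
using CUDA

function report()
    if threadIdx().x == 1
        @cuprintln("block $(blockIdx().x) is in cluster $(clusterIdx().x) of $(clusterDim().x)")
    end
    return
end

# with 4 blocks in clusters of 2, clusterIdx().x ranges over 1:2
@cuda threads=32 blocks=4 clusters=2 report()
synchronize()
```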
test/core/execution.jl (7 additions)

@@ -159,6 +159,13 @@ end
    @cuda stream=s dummy()
end

@testset "clusters" begin
if CUDA.capability(device()) >= v"9.0"
@cuda threads=64 clusters=2 dummy()
else
@test_throws ArgumentError @cuda threads=64 clusters=2 dummy()
end
end

@testset "external kernels" begin
@eval module KernelModule