diff --git a/lib/cudadrv/execution.jl b/lib/cudadrv/execution.jl
index 4725e5679b..643bbaed1f 100644
--- a/lib/cudadrv/execution.jl
+++ b/lib/cudadrv/execution.jl
@@ -44,11 +44,14 @@ end
 """
     launch(f::CuFunction; args...; blocks::CuDim=1, threads::CuDim=1,
-           cooperative=false, shmem=0, stream=stream())
+           clusters::CuDim=1, cooperative=false, shmem=0, stream=stream())
 
 Low-level call to launch a CUDA function `f` on the GPU, using `blocks` and `threads` as
 respectively the grid and block configuration. Dynamic shared memory is allocated according
-to `shmem`, and the kernel is launched on stream `stream`.
+to `shmem`, and the kernel is launched on stream `stream`. If any dimension of `clusters` is
+greater than 1, the blocks are grouped into [thread block clusters](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#thread-block-clusters);
+this requires a device with compute capability 9.0 or higher, and an `ArgumentError` is
+thrown otherwise.
 
 Arguments to a kernel should either be bitstype, in which case they will be copied to the
 internal kernel parameter buffer, or a pointer to device memory.
@@ -56,10 +59,14 @@ internal kernel parameter buffer, or a pointer to device memory.
 This is a low-level call, prefer to use [`cudacall`](@ref) instead.
 """
 function launch(f::CuFunction, args::Vararg{Any,N}; blocks::CuDim=1, threads::CuDim=1,
-                cooperative::Bool=false, shmem::Integer=0,
+                clusters::CuDim=1, cooperative::Bool=false, shmem::Integer=0,
                 stream::CuStream=stream()) where {N}
     blockdim = CuDim3(blocks)
     threaddim = CuDim3(threads)
+    clusterdim = CuDim3(clusters)
+    if CUDA.capability(device()) < v"9.0" && (clusterdim.x != 1 || clusterdim.y != 1 || clusterdim.z != 1)
+        throw(ArgumentError("thread block clusters require a device with compute capability 9.0 or higher"))
+    end
 
     try
         pack_arguments(args...) do kernelParams
@@ -68,11 +75,22 @@ function launch(f::CuFunction, args::Vararg{Any,N}; blocks::CuDim=1, threads::Cu
                                            blockdim.x, blockdim.y, blockdim.z,
                                            threaddim.x, threaddim.y, threaddim.z,
                                            shmem, stream, kernelParams)
-            else
+            elseif clusterdim.x == 1 && clusterdim.y == 1 && clusterdim.z == 1
                 cuLaunchKernel(f,
                                blockdim.x, blockdim.y, blockdim.z,
                                threaddim.x, threaddim.y, threaddim.z,
                                shmem, stream, kernelParams, C_NULL)
+            else
+                attr_val = CUlaunchAttributeValue()
+                attr_val.clusterDim.x = clusterdim.x
+                attr_val.clusterDim.y = clusterdim.y
+                attr_val.clusterDim.z = clusterdim.z
+                attr = CUlaunchAttribute(CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION,
+                                         (0, 0, 0, 0), attr_val)
+                config = CUlaunchConfig(blockdim.x, blockdim.y, blockdim.z,
+                                        threaddim.x, threaddim.y, threaddim.z,
+                                        shmem, stream, Ref(attr), Cuint(1))
+                cuLaunchKernelEx(config, f, kernelParams, C_NULL)
             end
         end
     catch err
diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl
index 635070fcef..6d8f306012 100644
--- a/src/compiler/execution.jl
+++ b/src/compiler/execution.jl
@@ -7,7 +7,7 @@ export @cuda, cudaconvert, cufunction, dynamic_cufunction, nextwarp, prevwarp
 
 const MACRO_KWARGS = [:dynamic, :launch]
 const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx]
-const LAUNCH_KWARGS = [:cooperative, :blocks, :threads, :shmem, :stream]
+const LAUNCH_KWARGS = [:cooperative, :blocks, :threads, :clusters, :shmem, :stream]
 
 
 """
@@ -224,6 +224,10 @@ The following keyword arguments are supported:
 - `blocks` (default: `1`): Number of thread blocks to launch, or a 1-, 2- or 3-tuple of dimensions (e.g. `blocks=(2, 4, 2)` for a 3D grid of blocks). Use [`blockIdx()`](@ref) and [`gridDim()`](@ref) to query from within the kernel.
+- `clusters` (default: `1`): Dimensions of a thread block cluster, i.e. the number of blocks per
+  cluster, or a 1-, 2- or 3-tuple of dimensions (e.g. `clusters=(2, 2, 2)` for 3D clusters).
+  Use [`clusterIdx()`](@ref) and [`clusterDim()`](@ref) to query from within the kernel.
+  Only supported on compute capability 9.0 and above; with the default `clusters=1`, no clusters are used.
 - `shmem`(default: `0`): Amount of dynamic shared memory in bytes to allocate per thread block; used by [`CuDynamicSharedArray`](@ref).
 - `stream` (default: [`stream()`](@ref)): [`CuStream`](@ref) to launch the kernel on.
diff --git a/src/device/intrinsics/indexing.jl b/src/device/intrinsics/indexing.jl
index 7677c553f5..039dbb69df 100644
--- a/src/device/intrinsics/indexing.jl
+++ b/src/device/intrinsics/indexing.jl
@@ -1,7 +1,7 @@
 # Indexing and dimensions (B.4)
 
 export
-    threadIdx, blockDim, blockIdx, gridDim,
+    threadIdx, blockDim, blockIdx, gridDim, clusterIdx, clusterDim,
     laneid, lanemask, warpsize, active_mask, FULL_MASK
 
 @generated function _index(::Val{name}, ::Val{range}) where {name, range}
@@ -38,6 +38,9 @@ end
 # https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
 const max_block_size = (x=1024, y=1024, z=64)
 const max_grid_size = (x=2^31-1, y=65535, z=65535)
+# a cluster may contain at most 8 thread blocks in total (16 on Hopper via a
+# non-portable opt-in): https://forums.developer.nvidia.com/t/cluster-size-limitation/279795
+const max_cluster_size = (x=8, y=8, z=8)
 
 for dim in (:x, :y, :z)
     # Thread index
@@ -59,6 +62,16 @@ for dim in (:x, :y, :z)
     fn = Symbol("gridDim_$dim")
     intr = Symbol("nctaid.$dim")
     @eval @inline $fn() = _index($(Val(intr)), $(Val(1:max_grid_size[dim])))
+
+    # Cluster index (the cluster's position within the grid)
+    fn = Symbol("clusterIdx_$dim")
+    intr = Symbol("clusterid.$dim")
+    @eval @inline $fn() = _index($(Val(intr)), $(Val(0:max_grid_size[dim]-1))) + 1i32
+
+    # Cluster dimensions (#blocks per cluster)
+    fn = Symbol("clusterDim_$dim")
+    intr = Symbol("cluster_nctaid.$dim")
+    @eval @inline $fn() = _index($(Val(intr)), $(Val(1:max_cluster_size[dim])))
 end
 
 @device_functions begin
@@ -70,6 +83,20 @@
 Returns the dimensions of the grid.
 """
 @inline gridDim() = (x=gridDim_x(), y=gridDim_y(), z=gridDim_z())
 
+"""
+    clusterIdx()::NamedTuple
+
+Returns the cluster index within the grid.
+"""
+@inline clusterIdx() = (x=clusterIdx_x(), y=clusterIdx_y(), z=clusterIdx_z())
+
+"""
+    clusterDim()::NamedTuple
+
+Returns the dimensions of the cluster, i.e. the number of blocks per cluster.
+"""
+@inline clusterDim() = (x=clusterDim_x(), y=clusterDim_y(), z=clusterDim_z())
+
 """
     blockIdx()::NamedTuple
diff --git a/test/core/execution.jl b/test/core/execution.jl
index ad799d6766..1da47a4831 100644
--- a/test/core/execution.jl
+++ b/test/core/execution.jl
@@ -159,6 +159,14 @@ end
     @cuda stream=s dummy()
 end
 
+@testset "clusters" begin
+    if CUDA.capability(device()) >= v"9.0"
+        @cuda blocks=2 threads=64 clusters=2 dummy()
+    else
+        @test_throws ArgumentError @cuda blocks=2 threads=64 clusters=2 dummy()
+    end
+end
+
 @testset "external kernels" begin
     @eval module KernelModule
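
For reference, a minimal usage sketch of the new keyword and intrinsics (not part of the diff; the `kernel` and `out` names are illustrative, and it assumes a device with compute capability 9.0 or higher):

```julia
using CUDA

# Record, for every block in the grid, which cluster it belongs to.
function kernel(out)
    # blockIdx() remains grid-wide; clusterIdx() identifies the enclosing
    # cluster, and clusterDim() gives the number of blocks per cluster.
    out[blockIdx().x] = clusterIdx().x
    return
end

out = CuArray{Int32}(undef, 4)

# Launch 4 blocks grouped into clusters of 2 blocks each; the grid
# dimensions must be divisible by the cluster dimensions.
@cuda blocks=4 threads=1 clusters=2 kernel(out)
synchronize()

Array(out)  # expected: [1, 1, 2, 2]
```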