Trial support for thread-block clusters #2825

Draft: wants to merge 1 commit into master
lib/cudadrv/execution.jl (22 additions, 4 deletions)

@@ -44,22 +44,29 @@ end

"""
launch(f::CuFunction; args...; blocks::CuDim=1, threads::CuDim=1,
cooperative=false, shmem=0, stream=stream())
clusters::CuDim=1, cooperative=false, shmem=0, stream=stream())

Low-level call to launch a CUDA function `f` on the GPU, using `blocks` and `threads` as
respectively the grid and block configuration. Dynamic shared memory is allocated according
to `shmem`, and the kernel is launched on stream `stream`. If `clusters > 1` and the device's
compute capability is 9.0 or higher, [thread block clusters](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#thread-block-clusters)
are launched. If `clusters > 1` on a device with compute capability below 9.0, an
`ArgumentError` is thrown, as thread block clusters are not supported there.

Arguments to a kernel should either be bitstype, in which case they will be copied to the
internal kernel parameter buffer, or a pointer to device memory.

This is a low-level call, prefer to use [`cudacall`](@ref) instead.
"""
function launch(f::CuFunction, args::Vararg{Any,N}; blocks::CuDim=1, threads::CuDim=1,
                clusters::CuDim=1, cooperative::Bool=false, shmem::Integer=0,
                stream::CuStream=stream()) where {N}
    blockdim = CuDim3(blocks)
    threaddim = CuDim3(threads)
    clusterdim = CuDim3(clusters)
    if CUDA.capability(device()) < v"9.0" &&
       (clusterdim.x != 1 || clusterdim.y != 1 || clusterdim.z != 1)
        throw(ArgumentError("devices with compute capability below 9.0 do not support thread block clusters!"))
    end

    try
        pack_arguments(args...) do kernelParams
            if cooperative
                cuLaunchCooperativeKernel(f,
                                          blockdim.x, blockdim.y, blockdim.z,
                                          threaddim.x, threaddim.y, threaddim.z,
                                          shmem, stream, kernelParams)
            elseif clusterdim.x == 1 && clusterdim.y == 1 && clusterdim.z == 1
                cuLaunchKernel(f,
                               blockdim.x, blockdim.y, blockdim.z,
                               threaddim.x, threaddim.y, threaddim.z,
                               shmem, stream, kernelParams, C_NULL)
            else
                # thread block clusters require the extended launch API:
                # describe the cluster dimensions through a launch attribute ...
                attr_val = CUlaunchAttributeValue()
                attr_val.clusterDim.x = clusterdim.x
                attr_val.clusterDim.y = clusterdim.y
                attr_val.clusterDim.z = clusterdim.z
                attr = CUlaunchAttribute(CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION,
                                         (0, 0, 0, 0), attr_val)
                # ... and pass it alongside the grid/block/shmem/stream configuration
                config = CUlaunchConfig(blockdim.x, blockdim.y, blockdim.z,
                                        threaddim.x, threaddim.y, threaddim.z,
                                        shmem, stream, Ref(attr), Cuint(1))
                cuLaunchKernelEx(config, f, kernelParams, C_NULL)
            end
        end
    catch err
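For illustration, here is a minimal sketch of driving this low-level API directly. It assumes an sm_90 device and access to the compiled `CuFunction` via CUDA.jl's `HostKernel.fun` field; the `noop` kernel and the launch geometry are hypothetical:

```julia
using CUDA

noop() = return

k = @cuda launch=false noop()  # compile the kernel without launching it
if CUDA.capability(device()) >= v"9.0"
    # 8 blocks of 128 threads, grouped into clusters of 2 blocks along x;
    # the cluster dimensions must evenly divide the grid dimensions
    CUDA.launch(k.fun; blocks=8, threads=128, clusters=2)
    synchronize()
end
```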
src/compiler/execution.jl (5 additions, 1 deletion)

@@ -7,7 +7,7 @@ export @cuda, cudaconvert, cufunction, dynamic_cufunction, nextwarp, prevwarp

const MACRO_KWARGS = [:dynamic, :launch]
const COMPILER_KWARGS = [:kernel, :name, :always_inline, :minthreads, :maxthreads, :blocks_per_sm, :maxregs, :fastmath, :cap, :ptx]
const LAUNCH_KWARGS = [:cooperative, :blocks, :threads, :clusters, :shmem, :stream]


"""
@@ -224,6 +224,10 @@ The following keyword arguments are supported:
- `blocks` (default: `1`): Number of thread blocks to launch, or a 1-, 2- or 3-tuple of
  dimensions (e.g. `blocks=(2, 4, 2)` for a 3D grid of blocks).
  Use [`blockIdx()`](@ref) and [`gridDim()`](@ref) to query from within the kernel.
- `clusters` (default: `1`): Number of thread blocks to group into each cluster,
  or a 1-, 2- or 3-tuple of dimensions (e.g. `clusters=(2, 2, 2)` for a 3D cluster).
  Use [`clusterIdx()`](@ref) and [`clusterDim()`](@ref) to query from within the kernel.
  Only supported on devices with compute capability 9.0 or higher; with the default of `1`,
  no cluster configuration is applied. See the sketch after this file's changes.
- `shmem` (default: `0`): Amount of dynamic shared memory in bytes to allocate per thread
  block; used by [`CuDynamicSharedArray`](@ref).
- `stream` (default: [`stream()`](@ref)): [`CuStream`](@ref) to launch the kernel on.
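A hedged sketch of the high-level path, assuming the `clusters` keyword from this PR and an sm_90 device; the kernel and launch geometry are illustrative only:

```julia
using CUDA

function cluster_kernel(a)
    i = (blockIdx().x - 1i32) * blockDim().x + threadIdx().x
    if i <= length(a)
        @inbounds a[i] = clusterIdx().x  # record which cluster wrote this element
    end
    return
end

a = CUDA.zeros(Int32, 256)
# 4 blocks of 64 threads, grouped into clusters of 2 blocks each
@cuda threads=64 blocks=4 clusters=2 cluster_kernel(a)
```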
src/device/intrinsics/indexing.jl (28 additions, 1 deletion)

@@ -1,7 +1,7 @@
# Indexing and dimensions (B.4)

export
    threadIdx, blockDim, blockIdx, gridDim, clusterIdx, clusterDim,
    laneid, lanemask, warpsize, active_mask, FULL_MASK

@generated function _index(::Val{name}, ::Val{range}) where {name, range}
@@ -38,6 +38,9 @@ end
# https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
const max_block_size = (x=1024, y=1024, z=64)
const max_grid_size = (x=2^31-1, y=65535, z=65535)
# the portable maximum cluster size is 8 blocks in total; Hopper can opt in to 16
# https://forums.developer.nvidia.com/t/cluster-size-limitation/279795
const max_cluster_size = (x=8, y=8, z=8)

for dim in (:x, :y, :z)
    # Thread index
@@ -59,6 +62,16 @@ for dim in (:x, :y, :z)
fn = Symbol("gridDim_$dim")
intr = Symbol("nctaid.$dim")
@eval @inline $fn() = _index($(Val(intr)), $(Val(1:max_grid_size[dim])))

# Cluster index
fn = Symbol("clusterIdx_$dim")
intr = Symbol("clusterid.$dim")
@eval @inline $fn() = _index($(Val(intr)), $(Val(0:max_cluster_size[dim]-1))) + 1i32

# Cluster size in grid (#clusters per grid)
fn = Symbol("clusterDim_$dim")
intr = Symbol("nclusterid.$dim")
@eval @inline $fn() = _index($(Val(intr)), $(Val(1:max_cluster_size[dim])))
end

@device_functions begin
@@ -70,6 +83,20 @@
Returns the dimensions of the grid.
"""
@inline gridDim() = (x=gridDim_x(), y=gridDim_y(), z=gridDim_z())

"""
clusterIdx()::NamedTuple

Returns the cluster index within the grid.
"""
@inline clusterIdx() = (x=clusterIdx_x(), y=clusterIdx_y(), z=clusterIdx_z())

"""
clusterDim()::NamedTuple

Returns the dimensions of the cluster.
"""
@inline clusterDim() = (x=clusterDim_x(), y=clusterDim_y(), z=clusterDim_z())

"""
    blockIdx()::NamedTuple

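To show what the new intrinsics report at run time, a hedged device-side sketch (assuming the 1-based convention of CUDA.jl's other index intrinsics and an sm_90 device):

```julia
using CUDA

function report()
    if threadIdx().x == 1
        @cuprintln("block $(blockIdx().x) is in cluster $(clusterIdx().x) of $(clusterDim().x)")
    end
    return
end

# with 4 blocks in clusters of 2, clusterIdx().x ranges over 1:2
@cuda threads=32 blocks=4 clusters=2 report()
synchronize()
```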
test/core/execution.jl (7 additions)

@@ -159,6 +159,13 @@ end
    @cuda stream=s dummy()
end

@testset "clusters" begin
if CUDA.capability(device()) >= v"9.0"
@cuda threads=64 clusters=2 dummy()
else
@test_throws ArgumentError @cuda threads=64 clusters=2 dummy()
end
end

@testset "external kernels" begin
@eval module KernelModule