[NVPTX] Add support for "blocksareclusters" kernel attr

rajatbajpai · rajatbajpai · commit 4bbfecc1194f · 2025-08-06T12:43:27.000+05:30
This change introduces a new kernel attribute that allows
thread blocks to be mapped to clusters.
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
@@ -92,6 +92,12 @@ Function Attributes
     dimension. Specifying a different cluster dimension at launch will result in
     a runtime error or kernel launch failure. Only supported for Hopper+.
 
+``"nvvm.blocksareclusters"``
+    This attribute implies that the grid launch configuration for the corresponding
+    kernel function is specifying the number of clusters instead of the number of thread
+    blocks. This attribute is only allowed for kernel functions and requires
+    ``nvvm.reqntid`` and ``nvvm.cluster_dim`` attributes.
+
 .. _address_spaces:
 
 Address Spaces
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -414,6 +414,18 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
   // the reqntid directive, and set the unspecified ones to 1.
   // If none of Reqntid* is specified, don't output reqntid directive.
   const auto ReqNTID = getReqNTID(F);
+
+  const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
+  const auto *STI = static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl());
+
+  const bool BlocksAreClusters =
+      F.hasFnAttribute("nvvm.blocksareclusters");
+  if (BlocksAreClusters && STI->getSmVersion() >= 90) {
+    if (ReqNTID.empty() || getClusterDim(F).empty())
+      report_fatal_error("blocksareclusters requires reqntid and cluster_dim");
+    O << ".blocksareclusters\n";
+  }
+
   if (!ReqNTID.empty())
     O << formatv(".reqntid {0:$[, ]}\n",
                  make_range(ReqNTID.begin(), ReqNTID.end()));
@@ -431,14 +443,14 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
 
   // .maxclusterrank directive requires SM_90 or higher, make sure that we
   // filter it out for lower SM versions, as it causes a hard ptxas crash.
-  const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
-  const auto *STI = static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl());
-
   if (STI->getSmVersion() >= 90) {
     const auto ClusterDim = getClusterDim(F);
 
     if (!ClusterDim.empty()) {
-      O << ".explicitcluster\n";
+
+      if (!BlocksAreClusters)
+        O << ".explicitcluster\n";
+
       if (ClusterDim[0] != 0) {
         assert(llvm::all_of(ClusterDim, [](unsigned D) { return D != 0; }) &&
                "cluster_dim_x != 0 implies cluster_dim_y and cluster_dim_z "
diff --git a/llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll b/llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+; Test "blocksareclusters" attribute with full "reqntid" and "cluster_dim"
+; attributes.
+define ptx_kernel void @kernel1(i32* %input, i32* %output) #0 #1 #2 {
+; CHECK-LABEL: kernel1(
+; CHECK:       .blocksareclusters
+; CHECK-NEXT:  .reqntid 1024, 1, 1
+; CHECK-NEXT:  .reqnctapercluster 2, 2, 2
+; CHECK-NEXT:  {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ret;
+  ret void
+}
+
+; Test "blocksareclusters" attribute with single dimension "reqntid" and
+; "cluster_dim" attributes.
+define ptx_kernel void @kernel2(i32* %input, i32* %output) #0 #3 #4 {
+; CHECK-LABEL: kernel2(
+; CHECK:       .blocksareclusters
+; CHECK-NEXT:  .reqntid 1024
+; CHECK-NEXT:  .reqnctapercluster 2 // @kernel2
+; CHECK-NEXT:  {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ret;
+  ret void
+}
+
+; Test "blocksareclusters" attribute with two dimensions(not z dimension)
+; "reqntid" and "cluster_dim" attributes.
+define ptx_kernel void @kernel3(i32* %input, i32* %output) #0 #5 #6 {
+; CHECK-LABEL: kernel3(
+; CHECK:       .blocksareclusters
+; CHECK-NEXT:  .reqntid 512, 2
+; CHECK-NEXT:  .reqnctapercluster 2, 2 // @kernel3
+; CHECK-NEXT:  {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ret;
+  ret void
+}
+
+; Test "blocksareclusters" attribute with full "reqntid" and "cluster_dim"
+; attributes where kernel attribute is provided through metadata.
+define void @kernel4(i32* %input, i32* %output) #0 #1 #2 {
+; CHECK-LABEL: kernel4(
+; CHECK:       .blocksareclusters
+; CHECK-NEXT:  .reqntid 1024, 1, 1
+; CHECK-NEXT:  .reqnctapercluster 2, 2, 2 // @kernel4
+; CHECK-NEXT:  {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ret;
+  ret void
+}
+
+attributes #0 = { "nvvm.blocksareclusters" }
+
+attributes #1 = { "nvvm.reqntid"="1024,1,1" }
+attributes #2 = { "nvvm.cluster_dim"="2,2,2" }
+
+attributes #3 = { "nvvm.reqntid"="1024" }
+attributes #4 = { "nvvm.cluster_dim"="2" }
+
+attributes #5 = { "nvvm.reqntid"="512,2" }
+attributes #6 = { "nvvm.cluster_dim"="2,2" }
+
+!0 = !{void (i32*, i32*)* @kernel4, !"kernel", i32 1 }
+!nvvm.annotations = !{!0}