From 1fa2aaa08c604dd6dc06dfba190797a3628edbe4 Mon Sep 17 00:00:00 2001 From: rbajpai Date: Fri, 1 Aug 2025 18:17:03 +0530 Subject: [PATCH 1/2] [NVPTX] Add support for "blocksareclusters" kernel attr This change introduces a new kernel attribute that allows thread blocks to be mapped to clusters. --- llvm/docs/NVPTXUsage.rst | 6 ++ llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 19 ++++- .../NVPTX/blocksareclusters-kernel-attr.ll | 78 +++++++++++++++++++ 3 files changed, 99 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 2dc8f9ff6a57f..629bf2ea5afb4 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -92,6 +92,12 @@ Function Attributes dimension. Specifying a different cluster dimension at launch will result in a runtime error or kernel launch failure. Only supported for Hopper+. +``"nvvm.blocksareclusters"`` + This attribute implies that the grid launch configuration for the corresponding + kernel function specifies the number of clusters instead of the number of thread + blocks. This attribute is only allowed for kernel functions and requires the + ``nvvm.reqntid`` and ``nvvm.cluster_dim`` attributes. + .. _address_spaces: Address Spaces diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 38912a7f09e30..385bf334ba338 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -414,6 +414,17 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, // the reqntid directive, and set the unspecified ones to 1. // If none of Reqntid* is specified, don't output reqntid directive. 
const auto ReqNTID = getReqNTID(F); + + const NVPTXTargetMachine &NTM = static_cast(TM); + const auto *STI = static_cast(NTM.getSubtargetImpl()); + + const bool BlocksAreClusters = F.hasFnAttribute("nvvm.blocksareclusters"); + if (BlocksAreClusters && STI->getSmVersion() >= 90) { + if (ReqNTID.empty() || getClusterDim(F).empty()) + report_fatal_error("blocksareclusters requires reqntid and cluster_dim"); + O << ".blocksareclusters\n"; + } + if (!ReqNTID.empty()) O << formatv(".reqntid {0:$[, ]}\n", make_range(ReqNTID.begin(), ReqNTID.end())); @@ -431,14 +442,14 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, // .maxclusterrank directive requires SM_90 or higher, make sure that we // filter it out for lower SM versions, as it causes a hard ptxas crash. - const NVPTXTargetMachine &NTM = static_cast(TM); - const auto *STI = static_cast(NTM.getSubtargetImpl()); - if (STI->getSmVersion() >= 90) { const auto ClusterDim = getClusterDim(F); if (!ClusterDim.empty()) { - O << ".explicitcluster\n"; + + if (!BlocksAreClusters) + O << ".explicitcluster\n"; + if (ClusterDim[0] != 0) { assert(llvm::all_of(ClusterDim, [](unsigned D) { return D != 0; }) && "cluster_dim_x != 0 implies cluster_dim_y and cluster_dim_z " diff --git a/llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll b/llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll new file mode 100644 index 0000000000000..13357f015a176 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll @@ -0,0 +1,78 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 | FileCheck %s + +target triple = "nvptx64-nvidia-cuda" + +; Test "blocksareclusters" attribute with full "reqntid" and "cluster_dim" +; attributes. 
+define ptx_kernel void @kernel1(i32* %input, i32* %output) #0 #1 #2 { +; CHECK-LABEL: kernel1( +; CHECK: .blocksareclusters +; CHECK-NEXT: .reqntid 1024, 1, 1 +; CHECK-NEXT: .reqnctapercluster 2, 2, 2 +; CHECK-NEXT: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ret; + ret void +} + +; Test "blocksareclusters" attribute with single dimension "reqntid" and +; "cluster_dim" attributes. +define ptx_kernel void @kernel2(i32* %input, i32* %output) #0 #3 #4 { +; CHECK-LABEL: kernel2( +; CHECK: .blocksareclusters +; CHECK-NEXT: .reqntid 1024 +; CHECK-NEXT: .reqnctapercluster 2 // @kernel2 +; CHECK-NEXT: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ret; + ret void +} + +; Test "blocksareclusters" attribute with two dimensions(not z dimension) +; "reqntid" and "cluster_dim" attributes. +define ptx_kernel void @kernel3(i32* %input, i32* %output) #0 #5 #6 { +; CHECK-LABEL: kernel3( +; CHECK: .blocksareclusters +; CHECK-NEXT: .reqntid 512, 2 +; CHECK-NEXT: .reqnctapercluster 2, 2 // @kernel3 +; CHECK-NEXT: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ret; + ret void +} + +; Test "blocksareclusters" attribute with full "reqntid" and "cluster_dim" +; attributes where kernel attribute is provided through metadata. 
+define void @kernel4(i32* %input, i32* %output) #0 #1 #2 { +; CHECK-LABEL: kernel4( +; CHECK: .blocksareclusters +; CHECK-NEXT: .reqntid 1024, 1, 1 +; CHECK-NEXT: .reqnctapercluster 2, 2, 2 // @kernel4 +; CHECK-NEXT: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ret; + ret void +} + +attributes #0 = { "nvvm.blocksareclusters" } + +attributes #1 = { "nvvm.reqntid"="1024,1,1" } +attributes #2 = { "nvvm.cluster_dim"="2,2,2" } + +attributes #3 = { "nvvm.reqntid"="1024" } +attributes #4 = { "nvvm.cluster_dim"="2" } + +attributes #5 = { "nvvm.reqntid"="512,2" } +attributes #6 = { "nvvm.cluster_dim"="2,2" } + +!0 = !{void (i32*, i32*)* @kernel4, !"kernel", i32 1 } +!nvvm.annotations = !{!0} From 0326668300443383be8f552cf32e47da7d40394c Mon Sep 17 00:00:00 2001 From: rbajpai Date: Thu, 14 Aug 2025 14:50:54 +0530 Subject: [PATCH 2/2] Addressed review comments In addition to "blocksareclusters" kernel attr this change also adds "ptx90" support to the NVPTX backend. --- llvm/lib/Target/NVPTX/NVPTX.td | 8 ++-- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 28 ++++++++----- llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 4 ++ llvm/lib/Target/NVPTX/NVPTXUtilities.h | 2 + .../NVPTX/blocksareclusters-kernel-attr.ll | 42 ++++++------------- 5 files changed, 39 insertions(+), 45 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td index 83992606bc419..8a445f82e7001 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.td +++ b/llvm/lib/Target/NVPTX/NVPTX.td @@ -97,10 +97,10 @@ foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53, def SM#sm#a : FeatureSM<""#sm#"a", !add(!mul(sm, 10), 3)>; } -foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65, - 70, 71, 72, 73, 74, 75, 76, 77, 78, - 80, 81, 82, 83, 84, 85, 86, 87, 88] in - def PTX#version: FeaturePTX; +foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65, 70, 71, 72, + 73, 74, 75, 76, 77, 78, 80, 81, 82, 83, 84, 85, 86, 87, 88, + 90] in + def PTX#version : FeaturePTX; 
//===----------------------------------------------------------------------===// // NVPTX supported processors. diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 385bf334ba338..f1b4398431a34 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -414,17 +414,6 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, // the reqntid directive, and set the unspecified ones to 1. // If none of Reqntid* is specified, don't output reqntid directive. const auto ReqNTID = getReqNTID(F); - - const NVPTXTargetMachine &NTM = static_cast(TM); - const auto *STI = static_cast(NTM.getSubtargetImpl()); - - const bool BlocksAreClusters = F.hasFnAttribute("nvvm.blocksareclusters"); - if (BlocksAreClusters && STI->getSmVersion() >= 90) { - if (ReqNTID.empty() || getClusterDim(F).empty()) - report_fatal_error("blocksareclusters requires reqntid and cluster_dim"); - O << ".blocksareclusters\n"; - } - if (!ReqNTID.empty()) O << formatv(".reqntid {0:$[, ]}\n", make_range(ReqNTID.begin(), ReqNTID.end())); @@ -442,8 +431,12 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, // .maxclusterrank directive requires SM_90 or higher, make sure that we // filter it out for lower SM versions, as it causes a hard ptxas crash. 
+ const NVPTXTargetMachine &NTM = static_cast(TM); + const auto *STI = static_cast(NTM.getSubtargetImpl()); + if (STI->getSmVersion() >= 90) { const auto ClusterDim = getClusterDim(F); + const bool BlocksAreClusters = hasBlocksAreClusters(F); if (!ClusterDim.empty()) { @@ -463,6 +456,19 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, "should be 0 as well"); } } + + if (BlocksAreClusters) { + LLVMContext &Ctx = F.getContext(); + if (ReqNTID.empty() || ClusterDim.empty()) { + Ctx.emitError( + "blocksareclusters requires reqntid and cluster_dim attributes"); + } else if (STI->getPTXVersion() < 90) { + Ctx.emitError("blocksareclusters requires PTX version >= 9.0"); + } else { + O << ".blocksareclusters\n"; + } + } + if (const auto Maxclusterrank = getMaxClusterRank(F)) O << ".maxclusterrank " << *Maxclusterrank << "\n"; } diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp index 6586f925504f1..274b04fdd30b5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -352,6 +352,10 @@ std::optional getMaxNReg(const Function &F) { return getFnAttrParsedInt(F, "nvvm.maxnreg"); } +bool hasBlocksAreClusters(const Function &F) { + return F.hasFnAttribute("nvvm.blocksareclusters"); +} + MaybeAlign getAlign(const CallInst &I, unsigned Index) { // First check the alignstack metadata if (MaybeAlign StackAlign = diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index 4eb452f398220..9421f9f54d0a6 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -60,6 +60,8 @@ std::optional getMaxClusterRank(const Function &); std::optional getMinCTASm(const Function &); std::optional getMaxNReg(const Function &); +bool hasBlocksAreClusters(const Function &); + inline bool isKernelFunction(const Function &F) { return F.getCallingConv() == CallingConv::PTX_Kernel; } diff --git 
a/llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll b/llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll index 13357f015a176..a0a99fe55654f 100644 --- a/llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll +++ b/llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll @@ -1,15 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 | FileCheck %s +; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx90 | FileCheck %s target triple = "nvptx64-nvidia-cuda" ; Test "blocksareclusters" attribute with full "reqntid" and "cluster_dim" ; attributes. -define ptx_kernel void @kernel1(i32* %input, i32* %output) #0 #1 #2 { +define ptx_kernel void @kernel1(ptr %input, ptr %output) #0 #1 #2 { ; CHECK-LABEL: kernel1( -; CHECK: .blocksareclusters -; CHECK-NEXT: .reqntid 1024, 1, 1 +; CHECK: .reqntid 1024, 1, 1 ; CHECK-NEXT: .reqnctapercluster 2, 2, 2 +; CHECK-NEXT: .blocksareclusters ; CHECK-NEXT: { ; CHECK-EMPTY: ; CHECK-EMPTY: @@ -20,11 +20,11 @@ define ptx_kernel void @kernel1(i32* %input, i32* %output) #0 #1 #2 { ; Test "blocksareclusters" attribute with single dimension "reqntid" and ; "cluster_dim" attributes. -define ptx_kernel void @kernel2(i32* %input, i32* %output) #0 #3 #4 { +define ptx_kernel void @kernel2(ptr %input, ptr %output) #0 #3 #4 { ; CHECK-LABEL: kernel2( -; CHECK: .blocksareclusters -; CHECK-NEXT: .reqntid 1024 -; CHECK-NEXT: .reqnctapercluster 2 // @kernel2 +; CHECK: .reqntid 1024 +; CHECK-NEXT: .reqnctapercluster 2 +; CHECK-NEXT: .blocksareclusters // @kernel2 ; CHECK-NEXT: { ; CHECK-EMPTY: ; CHECK-EMPTY: @@ -35,26 +35,11 @@ define ptx_kernel void @kernel2(i32* %input, i32* %output) #0 #3 #4 { ; Test "blocksareclusters" attribute with two dimensions(not z dimension) ; "reqntid" and "cluster_dim" attributes. 
-define ptx_kernel void @kernel3(i32* %input, i32* %output) #0 #5 #6 { +define ptx_kernel void @kernel3(ptr %input, ptr %output) #0 #5 #6 { ; CHECK-LABEL: kernel3( -; CHECK: .blocksareclusters -; CHECK-NEXT: .reqntid 512, 2 -; CHECK-NEXT: .reqnctapercluster 2, 2 // @kernel3 -; CHECK-NEXT: { -; CHECK-EMPTY: -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ret; - ret void -} - -; Test "blocksareclusters" attribute with full "reqntid" and "cluster_dim" -; attributes where kernel attribute is provided through metadata. -define void @kernel4(i32* %input, i32* %output) #0 #1 #2 { -; CHECK-LABEL: kernel4( -; CHECK: .blocksareclusters -; CHECK-NEXT: .reqntid 1024, 1, 1 -; CHECK-NEXT: .reqnctapercluster 2, 2, 2 // @kernel4 +; CHECK: .reqntid 512, 2 +; CHECK-NEXT: .reqnctapercluster 2, 2 +; CHECK-NEXT: .blocksareclusters // @kernel3 ; CHECK-NEXT: { ; CHECK-EMPTY: ; CHECK-EMPTY: @@ -73,6 +58,3 @@ attributes #4 = { "nvvm.cluster_dim"="2" } attributes #5 = { "nvvm.reqntid"="512,2" } attributes #6 = { "nvvm.cluster_dim"="2,2" } - -!0 = !{void (i32*, i32*)* @kernel4, !"kernel", i32 1 } -!nvvm.annotations = !{!0}