Skip to content

Commit 4bbfecc

Browse files
committed
[NVPTX] Add support for "blocksareclusters" kernel attr
This change introduces a new kernel attribute that allows thread blocks to be mapped to clusters.
1 parent 90d1d23 commit 4bbfecc

File tree

3 files changed

+100
-4
lines changed

3 files changed

+100
-4
lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,12 @@ Function Attributes
9292
dimension. Specifying a different cluster dimension at launch will result in
9393
a runtime error or kernel launch failure. Only supported for Hopper+.
9494

95+
``"nvvm.blocksareclusters"``
96+
This attribute indicates that the grid launch configuration for the corresponding
97+
kernel function specifies the number of clusters instead of the number of thread
98+
blocks. This attribute is only allowed for kernel functions and requires
99+
both the ``nvvm.reqntid`` and ``nvvm.cluster_dim`` attributes. Only supported for Hopper+.
100+
95101
.. _address_spaces:
96102

97103
Address Spaces

llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,18 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
414414
// the reqntid directive, and set the unspecified ones to 1.
415415
// If none of Reqntid* is specified, don't output reqntid directive.
416416
const auto ReqNTID = getReqNTID(F);
417+
418+
const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
419+
const auto *STI = static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl());
420+
421+
const bool BlocksAreClusters =
422+
F.hasFnAttribute("nvvm.blocksareclusters");
423+
if (BlocksAreClusters && STI->getSmVersion() >= 90) {
424+
if (ReqNTID.empty() || getClusterDim(F).empty())
425+
report_fatal_error("blocksareclusters requires reqntid and cluster_dim");
426+
O << ".blocksareclusters\n";
427+
}
428+
417429
if (!ReqNTID.empty())
418430
O << formatv(".reqntid {0:$[, ]}\n",
419431
make_range(ReqNTID.begin(), ReqNTID.end()));
@@ -431,14 +443,14 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
431443

432444
// .maxclusterrank directive requires SM_90 or higher, make sure that we
433445
// filter it out for lower SM versions, as it causes a hard ptxas crash.
434-
const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
435-
const auto *STI = static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl());
436-
437446
if (STI->getSmVersion() >= 90) {
438447
const auto ClusterDim = getClusterDim(F);
439448

440449
if (!ClusterDim.empty()) {
441-
O << ".explicitcluster\n";
450+
451+
if (!BlocksAreClusters)
452+
O << ".explicitcluster\n";
453+
442454
if (ClusterDim[0] != 0) {
443455
assert(llvm::all_of(ClusterDim, [](unsigned D) { return D != 0; }) &&
444456
"cluster_dim_x != 0 implies cluster_dim_y and cluster_dim_z "
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 | FileCheck %s
3+
4+
target triple = "nvptx64-nvidia-cuda"
5+
6+
; Test "blocksareclusters" attribute with full "reqntid" and "cluster_dim"
7+
; attributes.
8+
define ptx_kernel void @kernel1(i32* %input, i32* %output) #0 #1 #2 {
9+
; CHECK-LABEL: kernel1(
10+
; CHECK: .blocksareclusters
11+
; CHECK-NEXT: .reqntid 1024, 1, 1
12+
; CHECK-NEXT: .reqnctapercluster 2, 2, 2
13+
; CHECK-NEXT: {
14+
; CHECK-EMPTY:
15+
; CHECK-EMPTY:
16+
; CHECK-NEXT: // %bb.0:
17+
; CHECK-NEXT: ret;
18+
ret void
19+
}
20+
21+
; Test "blocksareclusters" attribute with single dimension "reqntid" and
22+
; "cluster_dim" attributes.
23+
define ptx_kernel void @kernel2(i32* %input, i32* %output) #0 #3 #4 {
24+
; CHECK-LABEL: kernel2(
25+
; CHECK: .blocksareclusters
26+
; CHECK-NEXT: .reqntid 1024
27+
; CHECK-NEXT: .reqnctapercluster 2 // @kernel2
28+
; CHECK-NEXT: {
29+
; CHECK-EMPTY:
30+
; CHECK-EMPTY:
31+
; CHECK-NEXT: // %bb.0:
32+
; CHECK-NEXT: ret;
33+
ret void
34+
}
35+
36+
; Test "blocksareclusters" attribute with two dimensions (no z dimension)
37+
; "reqntid" and "cluster_dim" attributes.
38+
define ptx_kernel void @kernel3(i32* %input, i32* %output) #0 #5 #6 {
39+
; CHECK-LABEL: kernel3(
40+
; CHECK: .blocksareclusters
41+
; CHECK-NEXT: .reqntid 512, 2
42+
; CHECK-NEXT: .reqnctapercluster 2, 2 // @kernel3
43+
; CHECK-NEXT: {
44+
; CHECK-EMPTY:
45+
; CHECK-EMPTY:
46+
; CHECK-NEXT: // %bb.0:
47+
; CHECK-NEXT: ret;
48+
ret void
49+
}
50+
51+
; Test "blocksareclusters" attribute with full "reqntid" and "cluster_dim"
52+
; attributes, where the kernel attribute is provided through metadata.
53+
define void @kernel4(i32* %input, i32* %output) #0 #1 #2 {
54+
; CHECK-LABEL: kernel4(
55+
; CHECK: .blocksareclusters
56+
; CHECK-NEXT: .reqntid 1024, 1, 1
57+
; CHECK-NEXT: .reqnctapercluster 2, 2, 2 // @kernel4
58+
; CHECK-NEXT: {
59+
; CHECK-EMPTY:
60+
; CHECK-EMPTY:
61+
; CHECK-NEXT: // %bb.0:
62+
; CHECK-NEXT: ret;
63+
ret void
64+
}
65+
66+
attributes #0 = { "nvvm.blocksareclusters" }
67+
68+
attributes #1 = { "nvvm.reqntid"="1024,1,1" }
69+
attributes #2 = { "nvvm.cluster_dim"="2,2,2" }
70+
71+
attributes #3 = { "nvvm.reqntid"="1024" }
72+
attributes #4 = { "nvvm.cluster_dim"="2" }
73+
74+
attributes #5 = { "nvvm.reqntid"="512,2" }
75+
attributes #6 = { "nvvm.cluster_dim"="2,2" }
76+
77+
!0 = !{void (i32*, i32*)* @kernel4, !"kernel", i32 1 }
78+
!nvvm.annotations = !{!0}

0 commit comments

Comments
 (0)