Skip to content

Commit 6eea017

Browse files
[NVPTX] Support for fence.acquire and fence.release
1 parent 617278e commit 6eea017

File tree

10 files changed

+637
-159
lines changed

10 files changed

+637
-159
lines changed

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -648,9 +648,50 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
648648
if (S == NVPTX::Scope::Cluster)
649649
T->failIfClustersUnsupported(".cluster scope fence");
650650

651+
// Fall back to .acq_rel if .acquire and .release are not supported.
652+
if (!T->hasSplitAcquireAndReleaseFences() &&
653+
(O == NVPTX::Ordering::Acquire || O == NVPTX::Ordering::Release))
654+
O = NVPTX::Ordering::AcquireRelease;
655+
651656
switch (O) {
652657
case NVPTX::Ordering::Acquire:
658+
switch (S) {
659+
case NVPTX::Scope::System:
660+
return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_sys
661+
: NVPTX::INT_MEMBAR_SYS;
662+
case NVPTX::Scope::Block:
663+
return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_cta
664+
: NVPTX::INT_MEMBAR_CTA;
665+
case NVPTX::Scope::Cluster:
666+
return NVPTX::atomic_thread_fence_acquire_cluster;
667+
case NVPTX::Scope::Device:
668+
return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu
669+
: NVPTX::INT_MEMBAR_GL;
670+
case NVPTX::Scope::Thread:
671+
report_fatal_error(
672+
formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
673+
ScopeToString(S)));
674+
}
675+
break;
653676
case NVPTX::Ordering::Release:
677+
switch (S) {
678+
case NVPTX::Scope::System:
679+
return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_sys
680+
: NVPTX::INT_MEMBAR_SYS;
681+
case NVPTX::Scope::Block:
682+
return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_cta
683+
: NVPTX::INT_MEMBAR_CTA;
684+
case NVPTX::Scope::Cluster:
685+
return NVPTX::atomic_thread_fence_release_cluster;
686+
case NVPTX::Scope::Device:
687+
return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu
688+
: NVPTX::INT_MEMBAR_GL;
689+
case NVPTX::Scope::Thread:
690+
report_fatal_error(
691+
formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
692+
ScopeToString(S)));
693+
}
694+
break;
654695
case NVPTX::Ordering::AcquireRelease: {
655696
switch (S) {
656697
case NVPTX::Scope::System:

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 10 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3866,33 +3866,16 @@ def : Pat <
38663866
// PTX Fence instructions
38673867
////////////////////////////////////////////////////////////////////////////////
38683868

3869-
def atomic_thread_fence_seq_cst_sys :
3870-
NVPTXInst<(outs), (ins), "fence.sc.sys;", []>,
3871-
Requires<[hasPTX<60>, hasSM<70>]>;
3872-
def atomic_thread_fence_acq_rel_sys :
3873-
NVPTXInst<(outs), (ins), "fence.acq_rel.sys;", []>,
3874-
Requires<[hasPTX<60>, hasSM<70>]>;
3875-
3876-
def atomic_thread_fence_seq_cst_gpu :
3877-
NVPTXInst<(outs), (ins), "fence.sc.gpu;", []>,
3878-
Requires<[hasPTX<60>, hasSM<70>]>;
3879-
def atomic_thread_fence_acq_rel_gpu :
3880-
NVPTXInst<(outs), (ins), "fence.acq_rel.gpu;", []>,
3881-
Requires<[hasPTX<60>, hasSM<70>]>;
3882-
3883-
def atomic_thread_fence_seq_cst_cluster :
3884-
NVPTXInst<(outs), (ins), "fence.sc.cluster;", []>,
3885-
Requires<[hasPTX<78>, hasSM<90>]>;
3886-
def atomic_thread_fence_acq_rel_cluster :
3887-
NVPTXInst<(outs), (ins), "fence.acq_rel.cluster;", []>,
3888-
Requires<[hasPTX<78>, hasSM<90>]>;
3889-
3890-
def atomic_thread_fence_seq_cst_cta :
3891-
NVPTXInst<(outs), (ins), "fence.sc.cta;", []>,
3892-
Requires<[hasPTX<60>, hasSM<70>]>;
3893-
def atomic_thread_fence_acq_rel_cta :
3894-
NVPTXInst<(outs), (ins), "fence.acq_rel.cta;", []>,
3895-
Requires<[hasPTX<60>, hasSM<70>]>;
3869+
// A PTX fence instruction with no operands, printed as "fence.<sem>.<scope>;".
// `scope` and `sem` are spliced verbatim into the assembly string; `ptx` is the
// PTX-version predicate, which is combined with a fixed hasSM<70> requirement.
class NVPTXFenceInst<string scope, string sem, Predicate ptx>:
3870+
NVPTXInst<(outs), (ins), "fence."#sem#"."#scope#";", []>,
3871+
Requires<[ptx, hasSM<70>]>;
3872+
3873+
// Emits one seq_cst / acq_rel / acquire / release fence def per scope, named
// atomic_thread_fence_<sem>_<scope>.
// NOTE(review): the acquire/release defs are gated on hasPTX<87> and hasSM<70>,
// whereas NVPTXSubtarget::hasSplitAcquireAndReleaseFences() checks
// PTXVersion >= 86 and SmVersion >= 90 -- confirm which thresholds are intended.
// NOTE(review): cluster-scope fences previously required hasPTX<78> and
// hasSM<90>; here they share the hasPTX<60> / hasSM<70> gate of the other
// scopes -- confirm this relaxation is intended.
foreach scope = ["sys", "gpu", "cluster", "cta"] in {
3874+
def atomic_thread_fence_seq_cst_#scope: NVPTXFenceInst<scope, "sc", hasPTX<60>>;
3875+
def atomic_thread_fence_acq_rel_#scope: NVPTXFenceInst<scope, "acq_rel", hasPTX<60>>;
3876+
def atomic_thread_fence_acquire_#scope: NVPTXFenceInst<scope, "acquire", hasPTX<87>>;
3877+
def atomic_thread_fence_release_#scope: NVPTXFenceInst<scope, "release", hasPTX<87>>;
3878+
}
38963879

38973880
def fpimm_any_zero : FPImmLeaf<fAny, [{
38983881
return Imm.isZero();

llvm/lib/Target/NVPTX/NVPTXSubtarget.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
8888
// Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire,
8989
// release, acq_rel, sc) ?
9090
bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; }
91+
// Does SM & PTX support .acquire and .release qualifiers for fence?
// True only when SmVersion >= 90 and PTXVersion >= 86. When this is false,
// getFenceOp lowers acquire and release fences as .acq_rel fences instead.
92+
bool hasSplitAcquireAndReleaseFences() const {
93+
return SmVersion >= 90 && PTXVersion >= 86;
94+
}
9195
// Does SM & PTX support atomic relaxed MMIO operations ?
9296
bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; }
9397
bool hasDotInstructions() const {

llvm/test/CodeGen/NVPTX/fence-sm-90.ll

Lines changed: 0 additions & 30 deletions
This file was deleted.
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
3+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx50 | FileCheck %s --check-prefix=SM30
4+
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx50 | %ptxas-verify %}
5+
6+
7+
define void @fence_acquire_() {
8+
; SM30-LABEL: fence_acquire_(
9+
; SM30: {
10+
; SM30-EMPTY:
11+
; SM30-EMPTY:
12+
; SM30-NEXT: // %bb.0:
13+
; SM30-NEXT: membar.sys;
14+
; SM30-NEXT: ret;
15+
fence syncscope("") acquire
16+
ret void
17+
}
18+
19+
20+
define void @fence_acquire_block() {
21+
; SM30-LABEL: fence_acquire_block(
22+
; SM30: {
23+
; SM30-EMPTY:
24+
; SM30-EMPTY:
25+
; SM30-NEXT: // %bb.0:
26+
; SM30-NEXT: membar.cta;
27+
; SM30-NEXT: ret;
28+
fence syncscope("block") acquire
29+
ret void
30+
}
31+
32+
; .cluster scope unsupported on SM = 30 PTX = 50
33+
34+
define void @fence_acquire_device() {
35+
; SM30-LABEL: fence_acquire_device(
36+
; SM30: {
37+
; SM30-EMPTY:
38+
; SM30-EMPTY:
39+
; SM30-NEXT: // %bb.0:
40+
; SM30-NEXT: membar.gl;
41+
; SM30-NEXT: ret;
42+
fence syncscope("device") acquire
43+
ret void
44+
}
45+
46+
47+
define void @fence_release_() {
48+
; SM30-LABEL: fence_release_(
49+
; SM30: {
50+
; SM30-EMPTY:
51+
; SM30-EMPTY:
52+
; SM30-NEXT: // %bb.0:
53+
; SM30-NEXT: membar.sys;
54+
; SM30-NEXT: ret;
55+
fence syncscope("") release
56+
ret void
57+
}
58+
59+
60+
define void @fence_release_block() {
61+
; SM30-LABEL: fence_release_block(
62+
; SM30: {
63+
; SM30-EMPTY:
64+
; SM30-EMPTY:
65+
; SM30-NEXT: // %bb.0:
66+
; SM30-NEXT: membar.cta;
67+
; SM30-NEXT: ret;
68+
fence syncscope("block") release
69+
ret void
70+
}
71+
72+
; .cluster scope unsupported on SM = 30 PTX = 50
73+
74+
define void @fence_release_device() {
75+
; SM30-LABEL: fence_release_device(
76+
; SM30: {
77+
; SM30-EMPTY:
78+
; SM30-EMPTY:
79+
; SM30-NEXT: // %bb.0:
80+
; SM30-NEXT: membar.gl;
81+
; SM30-NEXT: ret;
82+
fence syncscope("device") release
83+
ret void
84+
}
85+
86+
87+
define void @fence_acq_rel_() {
88+
; SM30-LABEL: fence_acq_rel_(
89+
; SM30: {
90+
; SM30-EMPTY:
91+
; SM30-EMPTY:
92+
; SM30-NEXT: // %bb.0:
93+
; SM30-NEXT: membar.sys;
94+
; SM30-NEXT: ret;
95+
fence syncscope("") acq_rel
96+
ret void
97+
}
98+
99+
100+
define void @fence_acq_rel_block() {
101+
; SM30-LABEL: fence_acq_rel_block(
102+
; SM30: {
103+
; SM30-EMPTY:
104+
; SM30-EMPTY:
105+
; SM30-NEXT: // %bb.0:
106+
; SM30-NEXT: membar.cta;
107+
; SM30-NEXT: ret;
108+
fence syncscope("block") acq_rel
109+
ret void
110+
}
111+
112+
; .cluster scope unsupported on SM = 30 PTX = 50
113+
114+
define void @fence_acq_rel_device() {
115+
; SM30-LABEL: fence_acq_rel_device(
116+
; SM30: {
117+
; SM30-EMPTY:
118+
; SM30-EMPTY:
119+
; SM30-NEXT: // %bb.0:
120+
; SM30-NEXT: membar.gl;
121+
; SM30-NEXT: ret;
122+
fence syncscope("device") acq_rel
123+
ret void
124+
}
125+
126+
127+
define void @fence_seq_cst_() {
128+
; SM30-LABEL: fence_seq_cst_(
129+
; SM30: {
130+
; SM30-EMPTY:
131+
; SM30-EMPTY:
132+
; SM30-NEXT: // %bb.0:
133+
; SM30-NEXT: membar.sys;
134+
; SM30-NEXT: ret;
135+
fence syncscope("") seq_cst
136+
ret void
137+
}
138+
139+
140+
define void @fence_seq_cst_block() {
141+
; SM30-LABEL: fence_seq_cst_block(
142+
; SM30: {
143+
; SM30-EMPTY:
144+
; SM30-EMPTY:
145+
; SM30-NEXT: // %bb.0:
146+
; SM30-NEXT: membar.cta;
147+
; SM30-NEXT: ret;
148+
fence syncscope("block") seq_cst
149+
ret void
150+
}
151+
152+
; .cluster scope unsupported on SM = 30 PTX = 50
153+
154+
define void @fence_seq_cst_device() {
155+
; SM30-LABEL: fence_seq_cst_device(
156+
; SM30: {
157+
; SM30-EMPTY:
158+
; SM30-EMPTY:
159+
; SM30-NEXT: // %bb.0:
160+
; SM30-NEXT: membar.gl;
161+
; SM30-NEXT: ret;
162+
fence syncscope("device") seq_cst
163+
ret void
164+
}
165+

0 commit comments

Comments
 (0)