Skip to content

Commit 46082e6

Browse files
author
git apple-llvm automerger
committed
Merge commit '735209c0688b' from llvm.org/main into next
2 parents eae0389 + 735209c commit 46082e6

File tree

22 files changed

+389
-197
lines changed

22 files changed

+389
-197
lines changed

clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1160,6 +1160,22 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
11601160
case NVPTX::BI__nvvm_fence_sc_cluster:
11611161
return Builder.CreateCall(
11621162
CGM.getIntrinsic(Intrinsic::nvvm_fence_sc_cluster));
1163+
case NVPTX::BI__nvvm_bar_sync:
1164+
return Builder.CreateCall(
1165+
CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all),
1166+
EmitScalarExpr(E->getArg(0)));
1167+
case NVPTX::BI__syncthreads:
1168+
return Builder.CreateCall(
1169+
CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all),
1170+
Builder.getInt32(0));
1171+
case NVPTX::BI__nvvm_barrier_sync:
1172+
return Builder.CreateCall(
1173+
CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_all),
1174+
EmitScalarExpr(E->getArg(0)));
1175+
case NVPTX::BI__nvvm_barrier_sync_cnt:
1176+
return Builder.CreateCall(
1177+
CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync),
1178+
{EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
11631179
default:
11641180
return nullptr;
11651181
}

clang/test/CodeGen/builtins-nvptx-ptx60.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,10 @@ __device__ void nvvm_sync(unsigned mask, int i, float f, int a, int b,
3232
// CHECK: call void @llvm.nvvm.bar.warp.sync(i32
3333
// expected-error@+1 {{'__nvvm_bar_warp_sync' needs target feature ptx60}}
3434
__nvvm_bar_warp_sync(mask);
35-
// CHECK: call void @llvm.nvvm.barrier.sync(i32
35+
// CHECK: call void @llvm.nvvm.barrier.cta.sync.all(i32
3636
// expected-error@+1 {{'__nvvm_barrier_sync' needs target feature ptx60}}
3737
__nvvm_barrier_sync(mask);
38-
// CHECK: call void @llvm.nvvm.barrier.sync.cnt(i32
38+
// CHECK: call void @llvm.nvvm.barrier.cta.sync(i32
3939
// expected-error@+1 {{'__nvvm_barrier_sync_cnt' needs target feature ptx60}}
4040
__nvvm_barrier_sync_cnt(mask, i);
4141

clang/test/CodeGen/builtins-nvptx.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ __device__ int read_pms() {
198198

199199
__device__ void sync() {
200200

201-
// CHECK: call void @llvm.nvvm.bar.sync(i32 0)
201+
// CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
202202

203203
__nvvm_bar_sync(0);
204204

@@ -259,7 +259,7 @@ __device__ void nvvm_math(float f1, float f2, double d1, double d2) {
259259
__nvvm_membar_gl();
260260
// CHECK: call void @llvm.nvvm.membar.sys()
261261
__nvvm_membar_sys();
262-
// CHECK: call void @llvm.nvvm.barrier0()
262+
// CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
263263
__syncthreads();
264264
}
265265

clang/test/Headers/gpuintrin.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -887,7 +887,7 @@ __gpu_kernel void foo() {
887887
// NVPTX-LABEL: define internal void @__gpu_sync_threads(
888888
// NVPTX-SAME: ) #[[ATTR0]] {
889889
// NVPTX-NEXT: [[ENTRY:.*:]]
890-
// NVPTX-NEXT: call void @llvm.nvvm.barrier0()
890+
// NVPTX-NEXT: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
891891
// NVPTX-NEXT: ret void
892892
//
893893
//

llvm/docs/NVPTXUsage.rst

Lines changed: 42 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -199,21 +199,58 @@ map in the following way to CUDA builtins:
199199
Barriers
200200
--------
201201

202-
'``llvm.nvvm.barrier0``'
203-
^^^^^^^^^^^^^^^^^^^^^^^^^^^
202+
'``llvm.nvvm.barrier.cta.*``'
203+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
204204

205205
Syntax:
206206
"""""""
207207

208208
.. code-block:: llvm
209209
210-
declare void @llvm.nvvm.barrier0()
210+
declare void @llvm.nvvm.barrier.cta.sync(i32 %id, i32 %n)
211+
declare void @llvm.nvvm.barrier.cta.sync.all(i32 %id)
212+
declare void @llvm.nvvm.barrier.cta.arrive(i32 %id, i32 %n)
213+
214+
declare void @llvm.nvvm.barrier.cta.sync.aligned(i32 %id, i32 %n)
215+
declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 %id)
216+
declare void @llvm.nvvm.barrier.cta.arrive.aligned(i32 %id, i32 %n)
211217
212218
Overview:
213219
"""""""""
214220

215-
The '``@llvm.nvvm.barrier0()``' intrinsic emits a PTX ``bar.sync 0``
216-
instruction, equivalent to the ``__syncthreads()`` call in CUDA.
221+
The '``@llvm.nvvm.barrier.cta.*``' family of intrinsics perform barrier
222+
synchronization and communication within a CTA. They can be used by the threads
223+
within the CTA for synchronization and communication.
224+
225+
Semantics:
226+
""""""""""
227+
228+
Operand %id specifies a logical barrier resource and must fall within the range
229+
0 through 15. When present, operand %n specifies the number of threads
230+
participating in the barrier. When specifying a thread count, the value must be
231+
a multiple of the warp size. With the '``@llvm.nvvm.barrier.cta.sync.*``'
232+
variants, the '``.all``' suffix indicates that all threads in the CTA should
233+
participate in the barrier and the %n operand is not present.
234+
235+
All forms of the '``@llvm.nvvm.barrier.cta.*``' intrinsic cause the executing
236+
thread to wait for all non-exited threads from its warp and then marks the
237+
warp's arrival at the barrier. In addition to signaling its arrival at the
238+
barrier, the '``@llvm.nvvm.barrier.cta.sync.*``' intrinsics cause the executing
239+
thread to wait for non-exited threads of all other warps participating in the
240+
barrier to arrive. On the other hand, the '``@llvm.nvvm.barrier.cta.arrive.*``'
241+
intrinsic does not cause the executing thread to wait for threads of other
242+
participating warps.
243+
244+
When a barrier completes, the waiting threads are restarted without delay,
245+
and the barrier is reinitialized so that it can be immediately reused.
246+
247+
The '``@llvm.nvvm.barrier.cta.*``' intrinsic has an optional '``.aligned``'
248+
modifier to indicate textual alignment of the barrier. When specified, it
249+
indicates that all threads in the CTA will execute the same
250+
'``@llvm.nvvm.barrier.cta.*``' instruction. In conditionally executed code, an
251+
aligned '``@llvm.nvvm.barrier.cta.*``' instruction should only be used if it is
252+
known that all threads in the CTA evaluate the condition identically, otherwise
253+
behavior is undefined.
217254

218255
Electing a thread
219256
-----------------

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,12 @@
128128
// * llvm.nvvm.swap.lo.hi.b64 --> llvm.fshl(x, x, 32)
129129
// * llvm.nvvm.atomic.load.inc.32 --> atomicrmw uinc_wrap
130130
// * llvm.nvvm.atomic.load.dec.32 --> atomicrmw udec_wrap
131+
// * llvm.nvvm.barrier0 --> llvm.nvvm.barrier.cta.sync.aligned.all(0)
132+
// * llvm.nvvm.barrier.n --> llvm.nvvm.barrier.cta.sync.aligned.all(x)
133+
// * llvm.nvvm.bar.sync --> llvm.nvvm.barrier.cta.sync.aligned.all(x)
134+
// * llvm.nvvm.barrier --> llvm.nvvm.barrier.cta.sync.aligned(x, y)
135+
// * llvm.nvvm.barrier.sync --> llvm.nvvm.barrier.cta.sync.all(x)
136+
// * llvm.nvvm.barrier.sync.cnt --> llvm.nvvm.barrier.cta.sync(x, y)
131137

132138
def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
133139
def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
@@ -1263,35 +1269,28 @@ let TargetPrefix = "nvvm" in {
12631269
defm int_nvvm_atomic_cas_gen_i : PTXAtomicWithScope3<llvm_anyint_ty>;
12641270

12651271
// Bar.Sync
1266-
1267-
// The builtin for "bar.sync 0" is called __syncthreads. Unlike most of the
1268-
// intrinsics in this file, this one is a user-facing API.
1269-
def int_nvvm_barrier0 : ClangBuiltin<"__syncthreads">,
1270-
Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>;
1271-
// Synchronize all threads in the CTA at barrier 'n'.
1272-
def int_nvvm_barrier_n : ClangBuiltin<"__nvvm_bar_n">,
1273-
Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
1274-
// Synchronize 'm', a multiple of warp size, (arg 2) threads in
1275-
// the CTA at barrier 'n' (arg 1).
1276-
def int_nvvm_barrier : ClangBuiltin<"__nvvm_bar">,
1277-
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12781272
def int_nvvm_barrier0_popc : ClangBuiltin<"__nvvm_bar0_popc">,
12791273
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12801274
def int_nvvm_barrier0_and : ClangBuiltin<"__nvvm_bar0_and">,
12811275
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12821276
def int_nvvm_barrier0_or : ClangBuiltin<"__nvvm_bar0_or">,
12831277
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12841278

1285-
def int_nvvm_bar_sync : NVVMBuiltin,
1286-
Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12871279
def int_nvvm_bar_warp_sync : NVVMBuiltin,
12881280
Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12891281

1290-
// barrier.sync id[, cnt]
1291-
def int_nvvm_barrier_sync : NVVMBuiltin,
1292-
Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
1293-
def int_nvvm_barrier_sync_cnt : NVVMBuiltin,
1294-
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
1282+
// barrier{.cta}.sync{.aligned} a{, b};
1283+
// barrier{.cta}.arrive{.aligned} a, b;
1284+
let IntrProperties = [IntrConvergent, IntrNoCallback] in {
1285+
foreach align = ["", "_aligned"] in {
1286+
def int_nvvm_barrier_cta_sync # align # _all :
1287+
Intrinsic<[], [llvm_i32_ty]>;
1288+
def int_nvvm_barrier_cta_sync # align :
1289+
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty]>;
1290+
def int_nvvm_barrier_cta_arrive # align :
1291+
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty]>;
1292+
}
1293+
}
12951294

12961295
// barrier.cluster.[wait, arrive, arrive.relaxed]
12971296
def int_nvvm_barrier_cluster_arrive :

llvm/lib/IR/AutoUpgrade.cpp

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1350,12 +1350,9 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
13501350
// nvvm.abs.{i,ii}
13511351
Expand =
13521352
Name == "i" || Name == "ll" || Name == "bf16" || Name == "bf16x2";
1353-
else if (Name == "fabs.f" || Name == "fabs.ftz.f" || Name == "fabs.d")
1353+
else if (Name.consume_front("fabs."))
13541354
// nvvm.fabs.{f,ftz.f,d}
1355-
Expand = true;
1356-
else if (Name == "clz.ll" || Name == "popc.ll" || Name == "h2f" ||
1357-
Name == "swap.lo.hi.b64")
1358-
Expand = true;
1355+
Expand = Name == "f" || Name == "ftz.f" || Name == "d";
13591356
else if (Name.consume_front("max.") || Name.consume_front("min."))
13601357
// nvvm.{min,max}.{i,ii,ui,ull}
13611358
Expand = Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
@@ -1387,7 +1384,18 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
13871384
Expand = (Name.starts_with("i.") || Name.starts_with("f.") ||
13881385
Name.starts_with("p."));
13891386
else
1390-
Expand = false;
1387+
Expand = StringSwitch<bool>(Name)
1388+
.Case("barrier0", true)
1389+
.Case("barrier.n", true)
1390+
.Case("barrier.sync.cnt", true)
1391+
.Case("barrier.sync", true)
1392+
.Case("barrier", true)
1393+
.Case("bar.sync", true)
1394+
.Case("clz.ll", true)
1395+
.Case("popc.ll", true)
1396+
.Case("h2f", true)
1397+
.Case("swap.lo.hi.b64", true)
1398+
.Default(false);
13911399

13921400
if (Expand) {
13931401
NewFn = nullptr;
@@ -2502,6 +2510,20 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
25022510
MDNode *MD = MDNode::get(Builder.getContext(), {});
25032511
LD->setMetadata(LLVMContext::MD_invariant_load, MD);
25042512
return LD;
2513+
} else if (Name == "barrier0" || Name == "barrier.n" || Name == "bar.sync") {
2514+
Value *Arg =
2515+
Name.ends_with('0') ? Builder.getInt32(0) : CI->getArgOperand(0);
2516+
Rep = Builder.CreateIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all,
2517+
{}, {Arg});
2518+
} else if (Name == "barrier") {
2519+
Rep = Builder.CreateIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned, {},
2520+
{CI->getArgOperand(0), CI->getArgOperand(1)});
2521+
} else if (Name == "barrier.sync") {
2522+
Rep = Builder.CreateIntrinsic(Intrinsic::nvvm_barrier_cta_sync_all, {},
2523+
{CI->getArgOperand(0)});
2524+
} else if (Name == "barrier.sync.cnt") {
2525+
Rep = Builder.CreateIntrinsic(Intrinsic::nvvm_barrier_cta_sync, {},
2526+
{CI->getArgOperand(0), CI->getArgOperand(1)});
25052527
} else {
25062528
Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name);
25072529
if (IID != Intrinsic::not_intrinsic &&

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,47 @@ def BF16RT : RegTyInfo<bf16, Int16Regs, bf16imm, fpimm, supports_imm = 0>;
237237
def F16X2RT : RegTyInfo<v2f16, Int32Regs, ?, ?, supports_imm = 0>;
238238
def BF16X2RT : RegTyInfo<v2bf16, Int32Regs, ?, ?, supports_imm = 0>;
239239

240+
// This class provides a basic wrapper around an NVPTXInst that abstracts the
241+
// specific syntax of most PTX instructions. It automatically handles the
242+
// construction of the asm string based on the provided dag arguments.
243+
// For example, the following asm-strings would be computed:
244+
//
245+
// * BasicFlagsNVPTXInst<(outs Int32Regs:$dst),
246+
// (ins Int32Regs:$a, Int32Regs:$b), (ins),
247+
// "add.s32">;
248+
// ---> "add.s32 \t$dst, $a, $b;"
249+
//
250+
// * BasicFlagsNVPTXInst<(outs Int32Regs:$d),
251+
// (ins Int32Regs:$a, Int32Regs:$b, Hexu32imm:$c),
252+
// (ins PrmtMode:$mode),
253+
// "prmt.b32${mode}">;
254+
// ---> "prmt.b32${mode} \t$d, $a, $b, $c;"
255+
//
256+
class BasicFlagsNVPTXInst<dag outs_dag, dag ins_dag, dag flags_dag, string asmstr,
257+
list<dag> pattern = []>
258+
: NVPTXInst<
259+
outs_dag,
260+
!con(ins_dag, flags_dag),
261+
!strconcat(
262+
asmstr,
263+
!if(!and(!empty(ins_dag), !empty(outs_dag)), "",
264+
!strconcat(
265+
" \t",
266+
!interleave(
267+
!foreach(i, !range(!size(outs_dag)),
268+
"$" # !getdagname(outs_dag, i)),
269+
"|"),
270+
!if(!or(!empty(ins_dag), !empty(outs_dag)), "", ", "),
271+
!interleave(
272+
!foreach(i, !range(!size(ins_dag)),
273+
"$" # !getdagname(ins_dag, i)),
274+
", "))),
275+
";"),
276+
pattern>;
277+
278+
class BasicNVPTXInst<dag outs, dag insv, string asmstr, list<dag> pattern = []>
279+
: BasicFlagsNVPTXInst<outs, insv, (ins), asmstr, pattern>;
280+
240281

241282
multiclass I3Inst<string op_str, SDPatternOperator op_node, RegTyInfo t,
242283
bit commutative, list<Predicate> requires = []> {

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 37 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -67,15 +67,6 @@ class THREADMASK_INFO<bit sync> {
6767
// Synchronization and shuffle functions
6868
//-----------------------------------
6969
let isConvergent = true in {
70-
def INT_BARRIER0 : NVPTXInst<(outs), (ins),
71-
"bar.sync \t0;",
72-
[(int_nvvm_barrier0)]>;
73-
def INT_BARRIERN : NVPTXInst<(outs), (ins Int32Regs:$src1),
74-
"bar.sync \t$src1;",
75-
[(int_nvvm_barrier_n i32:$src1)]>;
76-
def INT_BARRIER : NVPTXInst<(outs), (ins Int32Regs:$src1, Int32Regs:$src2),
77-
"bar.sync \t$src1, $src2;",
78-
[(int_nvvm_barrier i32:$src1, i32:$src2)]>;
7970
def INT_BARRIER0_POPC : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
8071
!strconcat("{{ \n\t",
8172
".reg .pred \t%p1; \n\t",
@@ -102,39 +93,51 @@ def INT_BARRIER0_OR : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
10293
"}}"),
10394
[(set i32:$dst, (int_nvvm_barrier0_or i32:$pred))]>;
10495

105-
def INT_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync \t$i;",
106-
[(int_nvvm_bar_sync imm:$i)]>;
107-
10896
def INT_BAR_WARP_SYNC_I : NVPTXInst<(outs), (ins i32imm:$i), "bar.warp.sync \t$i;",
10997
[(int_nvvm_bar_warp_sync imm:$i)]>,
11098
Requires<[hasPTX<60>, hasSM<30>]>;
11199
def INT_BAR_WARP_SYNC_R : NVPTXInst<(outs), (ins Int32Regs:$i), "bar.warp.sync \t$i;",
112100
[(int_nvvm_bar_warp_sync i32:$i)]>,
113101
Requires<[hasPTX<60>, hasSM<30>]>;
114102

115-
def INT_BARRIER_SYNC_I : NVPTXInst<(outs), (ins i32imm:$i), "barrier.sync \t$i;",
116-
[(int_nvvm_barrier_sync imm:$i)]>,
117-
Requires<[hasPTX<60>, hasSM<30>]>;
118-
def INT_BARRIER_SYNC_R : NVPTXInst<(outs), (ins Int32Regs:$i), "barrier.sync \t$i;",
119-
[(int_nvvm_barrier_sync i32:$i)]>,
120-
Requires<[hasPTX<60>, hasSM<30>]>;
103+
multiclass BARRIER1<string asmstr, Intrinsic intrinsic, list<Predicate> requires = []> {
104+
def _i : BasicNVPTXInst<(outs), (ins i32imm:$i), asmstr,
105+
[(intrinsic imm:$i)]>,
106+
Requires<requires>;
121107

122-
def INT_BARRIER_SYNC_CNT_RR : NVPTXInst<(outs), (ins Int32Regs:$id, Int32Regs:$cnt),
123-
"barrier.sync \t$id, $cnt;",
124-
[(int_nvvm_barrier_sync_cnt i32:$id, i32:$cnt)]>,
125-
Requires<[hasPTX<60>, hasSM<30>]>;
126-
def INT_BARRIER_SYNC_CNT_RI : NVPTXInst<(outs), (ins Int32Regs:$id, i32imm:$cnt),
127-
"barrier.sync \t$id, $cnt;",
128-
[(int_nvvm_barrier_sync_cnt i32:$id, imm:$cnt)]>,
129-
Requires<[hasPTX<60>, hasSM<30>]>;
130-
def INT_BARRIER_SYNC_CNT_IR : NVPTXInst<(outs), (ins i32imm:$id, Int32Regs:$cnt),
131-
"barrier.sync \t$id, $cnt;",
132-
[(int_nvvm_barrier_sync_cnt imm:$id, i32:$cnt)]>,
133-
Requires<[hasPTX<60>, hasSM<30>]>;
134-
def INT_BARRIER_SYNC_CNT_II : NVPTXInst<(outs), (ins i32imm:$id, i32imm:$cnt),
135-
"barrier.sync \t$id, $cnt;",
136-
[(int_nvvm_barrier_sync_cnt imm:$id, imm:$cnt)]>,
137-
Requires<[hasPTX<60>, hasSM<30>]>;
108+
def _r : BasicNVPTXInst<(outs), (ins Int32Regs:$i), asmstr,
109+
[(intrinsic i32:$i)]>,
110+
Requires<requires>;
111+
}
112+
113+
multiclass BARRIER2<string asmstr, Intrinsic intrinsic, list<Predicate> requires = []> {
114+
def _rr : BasicNVPTXInst<(outs), (ins Int32Regs:$i, Int32Regs:$j), asmstr,
115+
[(intrinsic i32:$i, i32:$j)]>,
116+
Requires<requires>;
117+
118+
def _ri : BasicNVPTXInst<(outs), (ins Int32Regs:$i, i32imm:$j), asmstr,
119+
[(intrinsic i32:$i, imm:$j)]>,
120+
Requires<requires>;
121+
122+
def _ir : BasicNVPTXInst<(outs), (ins i32imm:$i, Int32Regs:$j), asmstr,
123+
[(intrinsic imm:$i, i32:$j)]>,
124+
Requires<requires>;
125+
126+
def _ii : BasicNVPTXInst<(outs), (ins i32imm:$i, i32imm:$j), asmstr,
127+
[(intrinsic imm:$i, imm:$j)]>,
128+
Requires<requires>;
129+
}
130+
131+
// Note the "bar.sync" variants could be renamed to the equivalent corresponding
132+
// "barrier.*.aligned" variants. We use the older syntax for compatibility with
133+
// older versions of the PTX ISA.
134+
defm BARRIER_CTA_SYNC_ALIGNED_ALL : BARRIER1<"bar.sync", int_nvvm_barrier_cta_sync_aligned_all>;
135+
defm BARRIER_CTA_SYNC_ALIGNED : BARRIER2<"bar.sync", int_nvvm_barrier_cta_sync_aligned>;
136+
defm BARRIER_CTA_ARRIVE_ALIGNED : BARRIER2<"bar.arrive", int_nvvm_barrier_cta_arrive_aligned>;
137+
138+
defm BARRIER_CTA_SYNC_ALL : BARRIER1<"barrier.sync", int_nvvm_barrier_cta_sync_all, [hasPTX<60>]>;
139+
defm BARRIER_CTA_SYNC : BARRIER2<"barrier.sync", int_nvvm_barrier_cta_sync, [hasPTX<60>]>;
140+
defm BARRIER_CTA_ARRIVE : BARRIER2<"barrier.arrive", int_nvvm_barrier_cta_arrive, [hasPTX<60>]>;
138141

139142
class INT_BARRIER_CLUSTER<string variant, Intrinsic Intr,
140143
list<Predicate> Preds = [hasPTX<78>, hasSM<90>]>:

llvm/lib/Transforms/IPO/AttributorAttributes.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2150,7 +2150,8 @@ struct AANoUnwindCallSite final
21502150

21512151
bool AANoSync::isAlignedBarrier(const CallBase &CB, bool ExecutedAligned) {
21522152
switch (CB.getIntrinsicID()) {
2153-
case Intrinsic::nvvm_barrier0:
2153+
case Intrinsic::nvvm_barrier_cta_sync_aligned_all:
2154+
case Intrinsic::nvvm_barrier_cta_sync_aligned:
21542155
case Intrinsic::nvvm_barrier0_and:
21552156
case Intrinsic::nvvm_barrier0_or:
21562157
case Intrinsic::nvvm_barrier0_popc:

0 commit comments

Comments
 (0)