Skip to content

Commit d0ee820

Browse files
authored
[AMDGPU] Add s_barrier_init|join|leave instructions (#153296)
1 parent 8710571 commit d0ee820

19 files changed

+625
-3
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -503,6 +503,9 @@ TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal, "vIi", "n", "gfx12-insts")
503503
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_var, "vv*i", "n", "gfx12-insts")
504504
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_wait, "vIs", "n", "gfx12-insts")
505505
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_isfirst, "bIi", "n", "gfx12-insts")
506+
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_init, "vv*i", "n", "gfx12-insts")
507+
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_join, "vv*", "n", "gfx12-insts")
508+
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "vIs", "n", "gfx12-insts")
506509
TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
507510
TARGET_BUILTIN(__builtin_amdgcn_s_get_named_barrier_state, "Uiv*", "n", "gfx12-insts")
508511
TARGET_BUILTIN(__builtin_amdgcn_s_prefetch_data, "vvC*Ui", "nc", "gfx12-insts")

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,13 @@ kernel void builtins_amdgcn_s_barrier_signal_isfirst_err(global int* in, global
2323
*out = *in;
2424
}
2525

26+
kernel void builtins_amdgcn_s_barrier_leave_err(global int* in, global int* out, int barrier) {
27+
28+
__builtin_amdgcn_s_barrier_signal(-1);
29+
__builtin_amdgcn_s_barrier_leave(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_leave' must be a constant integer}}
30+
*out = *in;
31+
}
32+
2633
void test_s_buffer_prefetch_data(__amdgpu_buffer_rsrc_t rsrc, unsigned int off)
2734
{
2835
__builtin_amdgcn_s_buffer_prefetch_data(rsrc, off, 31); // expected-error {{'__builtin_amdgcn_s_buffer_prefetch_data' must be a constant integer}}

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,50 @@ void test_s_barrier_signal_isfirst(int* a, int* b, int *c)
139139
__builtin_amdgcn_s_barrier_wait(1);
140140
}
141141

142+
// CHECK-LABEL: @test_s_barrier_init(
143+
// CHECK-NEXT: entry:
144+
// CHECK-NEXT: [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
145+
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
146+
// CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
147+
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
148+
// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
149+
// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
150+
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
151+
// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
152+
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
153+
// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) [[TMP1]], i32 [[TMP2]])
154+
// CHECK-NEXT: ret void
155+
//
156+
void test_s_barrier_init(void *bar, int a)
157+
{
158+
__builtin_amdgcn_s_barrier_init(bar, a);
159+
}
160+
161+
// CHECK-LABEL: @test_s_barrier_join(
162+
// CHECK-NEXT: entry:
163+
// CHECK-NEXT: [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
164+
// CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
165+
// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
166+
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
167+
// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
168+
// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) [[TMP1]])
169+
// CHECK-NEXT: ret void
170+
//
171+
void test_s_barrier_join(void *bar)
172+
{
173+
__builtin_amdgcn_s_barrier_join(bar);
174+
}
175+
176+
// CHECK-LABEL: @test_s_barrier_leave(
177+
// CHECK-NEXT: entry:
178+
// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.leave(i16 1)
179+
// CHECK-NEXT: ret void
180+
//
181+
void test_s_barrier_leave()
182+
{
183+
__builtin_amdgcn_s_barrier_leave(1);
184+
}
185+
142186
// CHECK-LABEL: @test_s_get_barrier_state(
143187
// CHECK-NEXT: entry:
144188
// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,11 +290,29 @@ def int_amdgcn_s_barrier_signal_isfirst : ClangBuiltin<"__builtin_amdgcn_s_barri
290290
Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
291291
IntrWillReturn, IntrNoCallback, IntrNoFree]>;
292292

293+
// void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %barrier, i32 %memberCnt)
294+
// The %barrier and %memberCnt arguments must be uniform, otherwise behavior is undefined.
295+
def int_amdgcn_s_barrier_init : ClangBuiltin<"__builtin_amdgcn_s_barrier_init">,
296+
Intrinsic<[], [local_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
297+
IntrWillReturn, IntrNoCallback, IntrNoFree]>;
298+
299+
// void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %barrier)
300+
// The %barrier argument must be uniform, otherwise behavior is undefined.
301+
def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">,
302+
Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
303+
IntrNoCallback, IntrNoFree]>;
304+
293305
// void @llvm.amdgcn.s.barrier.wait(i16 %barrierType)
294306
def int_amdgcn_s_barrier_wait : ClangBuiltin<"__builtin_amdgcn_s_barrier_wait">,
295307
Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
296308
IntrWillReturn, IntrNoCallback, IntrNoFree]>;
297309

310+
311+
// void @llvm.amdgcn.s.barrier.leave(i16 %barrierType)
312+
def int_amdgcn_s_barrier_leave : ClangBuiltin<"__builtin_amdgcn_s_barrier_leave">,
313+
Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
314+
IntrWillReturn, IntrNoCallback, IntrNoFree]>;
315+
298316
// uint32_t @llvm.amdgcn.s.get.barrier.state(i32 %barrierId)
299317
// The %barrierId argument must be uniform, otherwise behavior is undefined.
300318
def int_amdgcn_s_get_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_barrier_state">,

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2368,8 +2368,10 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
23682368
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
23692369
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
23702370
return selectDSBvhStackIntrinsic(I);
2371+
case Intrinsic::amdgcn_s_barrier_init:
23712372
case Intrinsic::amdgcn_s_barrier_signal_var:
23722373
return selectNamedBarrierInit(I, IntrinsicID);
2374+
case Intrinsic::amdgcn_s_barrier_join:
23732375
case Intrinsic::amdgcn_s_get_named_barrier_state:
23742376
return selectNamedBarrierInst(I, IntrinsicID);
23752377
case Intrinsic::amdgcn_s_get_barrier_state:
@@ -6772,13 +6774,17 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
67726774
switch (IntrID) {
67736775
default:
67746776
llvm_unreachable("not a named barrier op");
6777+
case Intrinsic::amdgcn_s_barrier_join:
6778+
return AMDGPU::S_BARRIER_JOIN_IMM;
67756779
case Intrinsic::amdgcn_s_get_named_barrier_state:
67766780
return AMDGPU::S_GET_BARRIER_STATE_IMM;
67776781
};
67786782
} else {
67796783
switch (IntrID) {
67806784
default:
67816785
llvm_unreachable("not a named barrier op");
6786+
case Intrinsic::amdgcn_s_barrier_join:
6787+
return AMDGPU::S_BARRIER_JOIN_M0;
67826788
case Intrinsic::amdgcn_s_get_named_barrier_state:
67836789
return AMDGPU::S_GET_BARRIER_STATE_M0;
67846790
};
@@ -6829,8 +6835,11 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit(
68296835
BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
68306836
constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
68316837

6838+
unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
6839+
? AMDGPU::S_BARRIER_INIT_M0
6840+
: AMDGPU::S_BARRIER_SIGNAL_M0;
68326841
MachineInstrBuilder MIB;
6833-
MIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_M0));
6842+
MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
68346843

68356844
I.eraseFromParent();
68366845
return true;

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
156156
bool selectNamedBarrierInst(MachineInstr &I, Intrinsic::ID IID) const;
157157
bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const;
158158
bool selectSGetBarrierState(MachineInstr &I, Intrinsic::ID IID) const;
159+
bool selectSBarrierLeave(MachineInstr &I) const;
159160

160161
std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src,
161162
bool IsCanonicalizing = true,

llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,7 +352,10 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
352352
case Intrinsic::amdgcn_s_barrier_signal:
353353
case Intrinsic::amdgcn_s_barrier_signal_var:
354354
case Intrinsic::amdgcn_s_barrier_signal_isfirst:
355+
case Intrinsic::amdgcn_s_barrier_init:
356+
case Intrinsic::amdgcn_s_barrier_join:
355357
case Intrinsic::amdgcn_s_barrier_wait:
358+
case Intrinsic::amdgcn_s_barrier_leave:
356359
case Intrinsic::amdgcn_s_get_barrier_state:
357360
case Intrinsic::amdgcn_wave_barrier:
358361
case Intrinsic::amdgcn_sched_barrier:

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3342,6 +3342,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
33423342
assert(OpdMapper.getVRegs(1).empty());
33433343
constrainOpWithReadfirstlane(B, MI, 1);
33443344
return;
3345+
case Intrinsic::amdgcn_s_barrier_join:
3346+
constrainOpWithReadfirstlane(B, MI, 1);
3347+
return;
3348+
case Intrinsic::amdgcn_s_barrier_init:
33453349
case Intrinsic::amdgcn_s_barrier_signal_var:
33463350
constrainOpWithReadfirstlane(B, MI, 1);
33473351
constrainOpWithReadfirstlane(B, MI, 2);
@@ -5515,6 +5519,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
55155519
case Intrinsic::amdgcn_s_sleep_var:
55165520
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
55175521
break;
5522+
case Intrinsic::amdgcn_s_barrier_join:
5523+
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5524+
break;
5525+
case Intrinsic::amdgcn_s_barrier_init:
55185526
case Intrinsic::amdgcn_s_barrier_signal_var:
55195527
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
55205528
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10825,13 +10825,17 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
1082510825
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
1082610826
Op->getOperand(2), Chain),
1082710827
0);
10828+
case Intrinsic::amdgcn_s_barrier_init:
1082810829
case Intrinsic::amdgcn_s_barrier_signal_var: {
1082910830
// these two intrinsics have two operands: barrier pointer and member count
1083010831
SDValue Chain = Op->getOperand(0);
1083110832
SmallVector<SDValue, 2> Ops;
1083210833
SDValue BarOp = Op->getOperand(2);
1083310834
SDValue CntOp = Op->getOperand(3);
1083410835
SDValue M0Val;
10836+
unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
10837+
? AMDGPU::S_BARRIER_INIT_M0
10838+
: AMDGPU::S_BARRIER_SIGNAL_M0;
1083510839
// extract the BarrierID from bits 4-9 of BarOp
1083610840
SDValue BarID;
1083710841
BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
@@ -10855,8 +10859,40 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
1085510859

1085610860
Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
1085710861

10858-
auto *NewMI = DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_M0, DL,
10859-
Op->getVTList(), Ops);
10862+
auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10863+
return SDValue(NewMI, 0);
10864+
}
10865+
case Intrinsic::amdgcn_s_barrier_join: {
10866+
// this intrinsic has one operand: barrier pointer
10867+
SDValue Chain = Op->getOperand(0);
10868+
SmallVector<SDValue, 2> Ops;
10869+
SDValue BarOp = Op->getOperand(2);
10870+
unsigned Opc;
10871+
10872+
if (isa<ConstantSDNode>(BarOp)) {
10873+
uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
10874+
Opc = AMDGPU::S_BARRIER_JOIN_IMM;
10875+
10876+
// extract the BarrierID from bits 4-9 of the immediate
10877+
unsigned BarID = (BarVal >> 4) & 0x3F;
10878+
SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10879+
Ops.push_back(K);
10880+
Ops.push_back(Chain);
10881+
} else {
10882+
Opc = AMDGPU::S_BARRIER_JOIN_M0;
10883+
10884+
// extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
10885+
SDValue M0Val;
10886+
M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10887+
DAG.getShiftAmountConstant(4, MVT::i32, DL));
10888+
M0Val =
10889+
SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10890+
DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10891+
0);
10892+
Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10893+
}
10894+
10895+
auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
1086010896
return SDValue(NewMI, 0);
1086110897
}
1086210898
case Intrinsic::amdgcn_s_prefetch_data: {

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2341,6 +2341,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
23412341
case AMDGPU::S_MEMREALTIME:
23422342
case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
23432343
case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
2344+
case AMDGPU::S_BARRIER_LEAVE:
23442345
case AMDGPU::S_GET_BARRIER_STATE_M0:
23452346
case AMDGPU::S_GET_BARRIER_STATE_IMM:
23462347
ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);

0 commit comments

Comments
 (0)