Skip to content

Commit b7b3fab

Browse files
committed
[AMDGPU] Add s_wakeup_barrier instruction for gfx1250
1 parent bd21095 commit b7b3fab

File tree

13 files changed

+141
-7
lines changed

13 files changed

+141
-7
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -749,6 +749,7 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8bf16, "V8yV8y*3", "nc", "gfx
749749

750750
TARGET_BUILTIN(__builtin_amdgcn_s_setprio_inc_wg, "vIs", "n", "setprio-inc-wg-inst")
751751
TARGET_BUILTIN(__builtin_amdgcn_s_monitor_sleep, "vIs", "n", "gfx1250-insts")
752+
TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vv*", "n", "gfx1250-insts")
752753

753754
TARGET_BUILTIN(__builtin_amdgcn_s_wait_asynccnt, "vIUs", "n", "gfx1250-insts")
754755
TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs", "n", "gfx1250-insts")

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1489,6 +1489,21 @@ void test_s_cluster_barrier()
14891489
__builtin_amdgcn_s_cluster_barrier();
14901490
}
14911491

1492+
// CHECK-LABEL: @test_s_wakeup_barrier(
1493+
// CHECK-NEXT: entry:
1494+
// CHECK-NEXT: [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1495+
// CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
1496+
// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
1497+
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
1498+
// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
1499+
// CHECK-NEXT: call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) [[TMP1]])
1500+
// CHECK-NEXT: ret void
1501+
//
1502+
void test_s_wakeup_barrier(void *bar)
1503+
{
1504+
__builtin_amdgcn_s_wakeup_barrier(bar);
1505+
}
1506+
14921507
// CHECK-LABEL: @test_global_add_f32(
14931508
// CHECK-NEXT: entry:
14941509
// CHECK-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,12 @@ def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">,
314314
Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
315315
IntrNoCallback, IntrNoFree]>;
316316

317+
// void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) %barrier)
318+
// The %barrier argument must be uniform, otherwise behavior is undefined.
319+
def int_amdgcn_s_wakeup_barrier : ClangBuiltin<"__builtin_amdgcn_s_wakeup_barrier">,
320+
Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
321+
IntrNoCallback, IntrNoFree]>;
322+
317323
// void @llvm.amdgcn.s.barrier.wait(i16 %barrierType)
318324
def int_amdgcn_s_barrier_wait : ClangBuiltin<"__builtin_amdgcn_s_barrier_wait">,
319325
Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2392,6 +2392,16 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
23922392
case Intrinsic::amdgcn_s_barrier_init:
23932393
case Intrinsic::amdgcn_s_barrier_signal_var:
23942394
return selectNamedBarrierInit(I, IntrinsicID);
2395+
case Intrinsic::amdgcn_s_wakeup_barrier: {
2396+
if (!AMDGPU::isGFX1250(STI)) {
2397+
Function &F = I.getMF()->getFunction();
2398+
F.getContext().diagnose(
2399+
DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2400+
I.getDebugLoc(), DS_Error));
2401+
return false;
2402+
}
2403+
return selectNamedBarrierInst(I, IntrinsicID);
2404+
}
23952405
case Intrinsic::amdgcn_s_barrier_join:
23962406
case Intrinsic::amdgcn_s_get_named_barrier_state:
23972407
return selectNamedBarrierInst(I, IntrinsicID);
@@ -6830,6 +6840,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
68306840
llvm_unreachable("not a named barrier op");
68316841
case Intrinsic::amdgcn_s_barrier_join:
68326842
return AMDGPU::S_BARRIER_JOIN_IMM;
6843+
case Intrinsic::amdgcn_s_wakeup_barrier:
6844+
return AMDGPU::S_WAKEUP_BARRIER_IMM;
68336845
case Intrinsic::amdgcn_s_get_named_barrier_state:
68346846
return AMDGPU::S_GET_BARRIER_STATE_IMM;
68356847
};
@@ -6839,6 +6851,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
68396851
llvm_unreachable("not a named barrier op");
68406852
case Intrinsic::amdgcn_s_barrier_join:
68416853
return AMDGPU::S_BARRIER_JOIN_M0;
6854+
case Intrinsic::amdgcn_s_wakeup_barrier:
6855+
return AMDGPU::S_WAKEUP_BARRIER_M0;
68426856
case Intrinsic::amdgcn_s_get_named_barrier_state:
68436857
return AMDGPU::S_GET_BARRIER_STATE_M0;
68446858
};

llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,7 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
371371
case Intrinsic::amdgcn_s_barrier_wait:
372372
case Intrinsic::amdgcn_s_barrier_leave:
373373
case Intrinsic::amdgcn_s_get_barrier_state:
374+
case Intrinsic::amdgcn_s_wakeup_barrier:
374375
case Intrinsic::amdgcn_wave_barrier:
375376
case Intrinsic::amdgcn_sched_barrier:
376377
case Intrinsic::amdgcn_sched_group_barrier:

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3345,6 +3345,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
33453345
constrainOpWithReadfirstlane(B, MI, 1);
33463346
return;
33473347
case Intrinsic::amdgcn_s_barrier_join:
3348+
case Intrinsic::amdgcn_s_wakeup_barrier:
33483349
constrainOpWithReadfirstlane(B, MI, 1);
33493350
return;
33503351
case Intrinsic::amdgcn_s_barrier_init:
@@ -5579,6 +5580,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
55795580
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
55805581
break;
55815582
case Intrinsic::amdgcn_s_barrier_join:
5583+
case Intrinsic::amdgcn_s_wakeup_barrier:
55825584
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
55835585
break;
55845586
case Intrinsic::amdgcn_s_barrier_init:

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11503,6 +11503,11 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
1150311503
auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
1150411504
return SDValue(NewMI, 0);
1150511505
}
11506+
case Intrinsic::amdgcn_s_wakeup_barrier: {
11507+
if (!AMDGPU::isGFX1250(*Subtarget))
11508+
return SDValue();
11509+
[[fallthrough]];
11510+
}
1150611511
case Intrinsic::amdgcn_s_barrier_join: {
1150711512
// these intrinsics (s_barrier_join, s_wakeup_barrier) have one operand: barrier pointer
1150811513
SDValue Chain = Op->getOperand(0);
@@ -11512,16 +11517,32 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
1151211517

1151311518
if (isa<ConstantSDNode>(BarOp)) {
1151411519
uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
11515-
Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11516-
11520+
switch (IntrinsicID) {
11521+
default:
11522+
return SDValue();
11523+
case Intrinsic::amdgcn_s_barrier_join:
11524+
Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11525+
break;
11526+
case Intrinsic::amdgcn_s_wakeup_barrier:
11527+
Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
11528+
break;
11529+
}
1151711530
// extract the BarrierID from bits 4-9 of the immediate
1151811531
unsigned BarID = (BarVal >> 4) & 0x3F;
1151911532
SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
1152011533
Ops.push_back(K);
1152111534
Ops.push_back(Chain);
1152211535
} else {
11523-
Opc = AMDGPU::S_BARRIER_JOIN_M0;
11524-
11536+
switch (IntrinsicID) {
11537+
default:
11538+
return SDValue();
11539+
case Intrinsic::amdgcn_s_barrier_join:
11540+
Opc = AMDGPU::S_BARRIER_JOIN_M0;
11541+
break;
11542+
case Intrinsic::amdgcn_s_wakeup_barrier:
11543+
Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
11544+
break;
11545+
}
1152511546
// extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
1152611547
SDValue M0Val;
1152711548
M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -504,6 +504,12 @@ def S_BARRIER_JOIN_M0 : SOP1_Pseudo <"s_barrier_join m0", (outs), (ins),
504504
let isConvergent = 1;
505505
}
506506

507+
def S_WAKEUP_BARRIER_M0 : SOP1_Pseudo <"s_wakeup_barrier m0", (outs), (ins),
508+
"", []>{
509+
let SchedRW = [WriteBarrier];
510+
let isConvergent = 1;
511+
let SubtargetPredicate = isGFX1250Plus;
512+
}
507513
} // End Uses = [M0]
508514

509515
def S_BARRIER_SIGNAL_IMM : SOP1_Pseudo <"s_barrier_signal", (outs),
@@ -527,6 +533,12 @@ def S_BARRIER_JOIN_IMM : SOP1_Pseudo <"s_barrier_join", (outs),
527533
let isConvergent = 1;
528534
}
529535

536+
def S_WAKEUP_BARRIER_IMM : SOP1_Pseudo <"s_wakeup_barrier", (outs),
537+
(ins SplitBarrier:$src0), "$src0", []>{
538+
let SchedRW = [WriteBarrier];
539+
let isConvergent = 1;
540+
let SubtargetPredicate = isGFX1250Plus;
541+
}
530542
} // End has_sdst = 0
531543

532544
def S_GET_BARRIER_STATE_IMM : SOP1_Pseudo <"s_get_barrier_state", (outs SSrc_b32:$sdst),
@@ -2226,6 +2238,8 @@ defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12<0x058>;
22262238
// GFX1250
22272239
defm S_GET_SHADER_CYCLES_U64 : SOP1_Real_gfx12<0x06>;
22282240
defm S_ADD_PC_I64 : SOP1_Real_gfx12<0x04b>;
2241+
defm S_WAKEUP_BARRIER_M0 : SOP1_M0_Real_gfx12<0x057>;
2242+
defm S_WAKEUP_BARRIER_IMM : SOP1_IMM_Real_gfx12<0x057>;
22292243

22302244
//===----------------------------------------------------------------------===//
22312245
// SOP1 - GFX1150, GFX12

llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,8 +112,7 @@ attributes #2 = { nounwind readnone }
112112
; CHECK: attributes #[[ATTR0]] = { nounwind }
113113
; CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-lds-size"="1" }
114114
; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
115-
; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nounwind }
116-
; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
115+
; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
117116
;.
118117
; CHECK: [[META0]] = !{i32 8396816, i32 8396817}
119118
; CHECK: [[META1]] = !{i32 8396912, i32 8396913}

llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,6 @@ attributes #2 = { nounwind readnone }
9696
;.
9797
; CHECK: attributes #[[ATTR0]] = { nounwind }
9898
; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
99-
; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind }
10099
;.
101100
; CHECK: [[META0]] = !{i32 8396816, i32 8396817}
102101
; CHECK: [[META1]] = !{i32 8396912, i32 8396913}

0 commit comments

Comments
 (0)