Skip to content

Commit 1b91fa0

Browse files
arsenm authored and memfrob committed
AMDGPU: Define raw/struct variants of buffer atomic fadd
Somehow the new FP atomic buffer intrinsics ended up using the legacy style for buffer intrinsics.
1 parent 152efc4 commit 1b91fa0

12 files changed

+721
-29
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 16 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -973,9 +973,9 @@ class AMDGPUStructBufferStore<LLVMType data_ty = llvm_any_ty> : Intrinsic <
973973
def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore;
974974
def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore;
975975

976-
class AMDGPURawBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
977-
[data_ty],
978-
[LLVMMatchType<0>, // vdata(VGPR)
976+
class AMDGPURawBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = 0> : Intrinsic <
977+
!if(NoRtn, [], [data_ty]),
978+
[!if(NoRtn, data_ty, LLVMMatchType<0>), // vdata(VGPR)
979979
llvm_v4i32_ty, // rsrc(SGPR)
980980
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
981981
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
@@ -1005,9 +1005,12 @@ def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
10051005
[ImmArg<ArgIndex<5>>, IntrWillReturn], "", [SDNPMemOperand]>,
10061006
AMDGPURsrcIntrinsic<2, 0>;
10071007

1008-
class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
1009-
[data_ty],
1010-
[LLVMMatchType<0>, // vdata(VGPR)
1008+
// gfx908 intrinsic
1009+
def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty, /*NoRtn*/1>;
1010+
1011+
class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = 0> : Intrinsic <
1012+
!if(NoRtn, [], [data_ty]),
1013+
[!if(NoRtn, data_ty, LLVMMatchType<0>), // vdata(VGPR)
10111014
llvm_v4i32_ty, // rsrc(SGPR)
10121015
llvm_i32_ty, // vindex(VGPR)
10131016
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
@@ -1039,6 +1042,10 @@ def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
10391042
[ImmArg<ArgIndex<6>>, IntrWillReturn], "", [SDNPMemOperand]>,
10401043
AMDGPURsrcIntrinsic<2, 0>;
10411044

1045+
// gfx908 intrinsic
1046+
def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty, /*NoRtn*/1>;
1047+
1048+
10421049
// Obsolescent tbuffer intrinsics.
10431050
def int_amdgcn_tbuffer_load : Intrinsic <
10441051
[llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
@@ -1804,9 +1811,11 @@ class AMDGPUGlobalAtomicNoRtn : Intrinsic <
18041811
[IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>], "",
18051812
[SDNPMemOperand]>;
18061813

1807-
def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicNoRtn;
18081814
def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicNoRtn;
18091815

1816+
// Legacy form of the intrinsic. raw and struct forms should be preferred.
1817+
def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicNoRtn;
1818+
18101819
// llvm.amdgcn.mfma.f32.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
18111820
def int_amdgcn_mfma_f32_32x32x1f32 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x1f32">,
18121821
Intrinsic<[llvm_v32f32_ty],

llvm/lib/Target/AMDGPU/AMDGPUGISel.td

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -204,6 +204,7 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_OR, SIbuffer_atomic_or>;
204204
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_XOR, SIbuffer_atomic_xor>;
205205
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_INC, SIbuffer_atomic_inc>;
206206
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_DEC, SIbuffer_atomic_dec>;
207+
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD, SIbuffer_atomic_fadd>;
207208
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;
208209
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>;
209210

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 23 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -3647,6 +3647,9 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
36473647
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
36483648
case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
36493649
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3650+
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3651+
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3652+
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
36503653
default:
36513654
llvm_unreachable("unhandled atomic opcode");
36523655
}
@@ -3657,20 +3660,28 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
36573660
Intrinsic::ID IID) const {
36583661
const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
36593662
IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3663+
const bool HasReturn = MI.getNumExplicitDefs() != 0;
36603664

3661-
Register Dst = MI.getOperand(0).getReg();
3662-
Register VData = MI.getOperand(2).getReg();
3665+
Register Dst;
36633666

3664-
Register CmpVal;
36653667
int OpOffset = 0;
3668+
if (HasReturn) {
3669+
// A few FP atomics do not support return values.
3670+
Dst = MI.getOperand(0).getReg();
3671+
} else {
3672+
OpOffset = -1;
3673+
}
3674+
3675+
Register VData = MI.getOperand(2 + OpOffset).getReg();
3676+
Register CmpVal;
36663677

36673678
if (IsCmpSwap) {
36683679
CmpVal = MI.getOperand(3 + OpOffset).getReg();
36693680
++OpOffset;
36703681
}
36713682

36723683
Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3673-
const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3684+
const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
36743685

36753686
// The struct intrinsic variants add one additional operand over raw.
36763687
const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
@@ -3695,9 +3706,12 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
36953706
if (!VIndex)
36963707
VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
36973708

3698-
auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3699-
.addDef(Dst)
3700-
.addUse(VData); // vdata
3709+
auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3710+
3711+
if (HasReturn)
3712+
MIB.addDef(Dst);
3713+
3714+
MIB.addUse(VData); // vdata
37013715

37023716
if (IsCmpSwap)
37033717
MIB.addReg(CmpVal);
@@ -4462,6 +4476,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
44624476
case Intrinsic::amdgcn_struct_buffer_atomic_inc:
44634477
case Intrinsic::amdgcn_raw_buffer_atomic_dec:
44644478
case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4479+
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4480+
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
44654481
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
44664482
case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
44674483
return legalizeBufferAtomic(MI, B, IntrID);

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2957,6 +2957,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
29572957
executeInWaterfallLoop(MI, MRI, {2, 5});
29582958
return;
29592959
}
2960+
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
2961+
applyDefaultMapping(OpdMapper);
2962+
executeInWaterfallLoop(MI, MRI, {1, 4});
2963+
return;
2964+
}
29602965
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
29612966
applyDefaultMapping(OpdMapper);
29622967
executeInWaterfallLoop(MI, MRI, {3, 6});
@@ -3933,6 +3938,23 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
39333938
// initialized.
39343939
break;
39353940
}
3941+
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
3942+
// vdata_in
3943+
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3944+
3945+
// rsrc
3946+
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3947+
3948+
// vindex
3949+
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3950+
3951+
// voffset
3952+
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3953+
3954+
// soffset
3955+
OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3956+
break;
3957+
}
39363958
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
39373959
// vdata_out
39383960
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -225,6 +225,7 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_or>;
225225
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>;
226226
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_inc>;
227227
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_dec>;
228+
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>;
228229
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>;
229230
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_swap>;
230231
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_add>;
@@ -238,6 +239,7 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_or>;
238239
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>;
239240
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>;
240241
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>;
242+
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>;
241243
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
242244
def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>;
243245
def : SourceOfDivergence<int_amdgcn_ps_live>;

llvm/lib/Target/AMDGPU/BUFInstructions.td

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1400,34 +1400,34 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
14001400
(name vt:$vdata_in, v4i32:$rsrc, 0,
14011401
0, i32:$soffset, timm:$offset,
14021402
timm:$cachepolicy, 0),
1403-
(!cast<MUBUF_Pseudo>(opcode # _OFFSET) $vdata_in, $rsrc, $soffset,
1404-
(as_i16imm $offset), (extract_slc $cachepolicy))
1403+
(!cast<MUBUF_Pseudo>(opcode # _OFFSET) getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
1404+
(as_i16timm $offset), (extract_slc $cachepolicy))
14051405
>;
14061406

14071407
def : GCNPat<
14081408
(name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
14091409
0, i32:$soffset, timm:$offset,
14101410
timm:$cachepolicy, timm),
1411-
(!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vdata_in, $vindex, $rsrc, $soffset,
1412-
(as_i16imm $offset), (extract_slc $cachepolicy))
1411+
(!cast<MUBUF_Pseudo>(opcode # _IDXEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
1412+
(as_i16timm $offset), (extract_slc $cachepolicy))
14131413
>;
14141414

14151415
def : GCNPat<
14161416
(name vt:$vdata_in, v4i32:$rsrc, 0,
14171417
i32:$voffset, i32:$soffset, timm:$offset,
14181418
timm:$cachepolicy, 0),
1419-
(!cast<MUBUF_Pseudo>(opcode # _OFFEN) $vdata_in, $voffset, $rsrc, $soffset,
1420-
(as_i16imm $offset), (extract_slc $cachepolicy))
1419+
(!cast<MUBUF_Pseudo>(opcode # _OFFEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
1420+
(as_i16timm $offset), (extract_slc $cachepolicy))
14211421
>;
14221422

14231423
def : GCNPat<
14241424
(name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
14251425
i32:$voffset, i32:$soffset, timm:$offset,
14261426
timm:$cachepolicy, timm),
14271427
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
1428-
$vdata_in,
1429-
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
1430-
$rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy))
1428+
getVregSrcForVT<vt>.ret:$vdata_in,
1429+
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
1430+
SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy))
14311431
>;
14321432
}
14331433

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 7 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1082,8 +1082,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
10821082
Info.flags |= MachineMemOperand::MOStore;
10831083
} else {
10841084
// Atomic
1085-
Info.opc = ISD::INTRINSIC_W_CHAIN;
1086-
Info.memVT = MVT::getVT(CI.getType());
1085+
Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1086+
ISD::INTRINSIC_W_CHAIN;
1087+
Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
10871088
Info.flags = MachineMemOperand::MOLoad |
10881089
MachineMemOperand::MOStore |
10891090
MachineMemOperand::MODereferenceable;
@@ -7062,7 +7063,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
70627063
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
70637064
case Intrinsic::amdgcn_raw_buffer_atomic_dec:
70647065
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
7065-
70667066
case Intrinsic::amdgcn_struct_buffer_atomic_swap:
70677067
return lowerStructBufferAtomicIntrin(Op, DAG,
70687068
AMDGPUISD::BUFFER_ATOMIC_SWAP);
@@ -7485,7 +7485,10 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
74857485
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
74867486
M->getMemoryVT(), M->getMemOperand());
74877487
}
7488-
7488+
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7489+
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
7490+
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7491+
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
74897492
case Intrinsic::amdgcn_buffer_atomic_fadd: {
74907493
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
74917494
unsigned IdxEn = 1;

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2410,8 +2410,8 @@ def G_AMDGPU_ATOMIC_FMIN : G_ATOMICRMW_OP;
24102410
def G_AMDGPU_ATOMIC_FMAX : G_ATOMICRMW_OP;
24112411
}
24122412

2413-
class BufferAtomicGenericInstruction : AMDGPUGenericInstruction {
2414-
let OutOperandList = (outs type0:$dst);
2413+
class BufferAtomicGenericInstruction<bit NoRtn = 0> : AMDGPUGenericInstruction {
2414+
let OutOperandList = !if(NoRtn, (outs), (outs type0:$dst));
24152415
let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
24162416
type2:$soffset, untyped_imm_0:$offset,
24172417
untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
@@ -2432,6 +2432,7 @@ def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction;
24322432
def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
24332433
def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
24342434
def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
2435+
def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction<1/*NoRtn*/>;
24352436

24362437
def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction {
24372438
let OutOperandList = (outs type0:$dst);

0 commit comments

Comments
 (0)