Commit d604ab6

[AMDGPU] Support image atomic no return instructions (#150742)
Add support for no-return variants of image atomic operations (e.g. IMAGE_ATOMIC_ADD_NORTN, IMAGE_ATOMIC_CMPSWAP_NORTN). These variants are selected when the return value of the intrinsic is unused, allowing the backend to emit the no-return forms of the instructions.
1 parent 5378584 · commit d604ab6

10 files changed: +786 −160 lines
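
The optimization keys purely on whether the intrinsic's result has uses; the intrinsic and its operands are unchanged. A minimal IR sketch of the two cases, modeled on the tests below (the function names are illustrative, not from the patch):

; Result used: the returning form is selected and the read-back stays enabled
; (glc on pre-gfx12 targets, th:TH_ATOMIC_RETURN on gfx12).
define amdgpu_ps float @cmpswap_ret(<8 x i32> inreg %rsrc, i32 %cmp, i32 %swap, i32 %s) {
main_body:
  %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
  %out = bitcast i32 %v to float
  ret float %out
}

; Result dead: with this commit the backend selects IMAGE_ATOMIC_CMPSWAP_NORTN.
define amdgpu_ps void @cmpswap_noret(<8 x i32> inreg %rsrc, i32 %cmp, i32 %swap, i32 %s) {
main_body:
  %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
  ret void
}

declare i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg)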

llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h

Lines changed: 1 addition & 0 deletions

@@ -50,6 +50,7 @@ const D16ImageDimIntrinsic *lookupD16ImageDimIntrinsic(unsigned Intr);
 struct ImageDimIntrinsicInfo {
   unsigned Intr;
   unsigned BaseOpcode;
+  unsigned AtomicNoRetBaseOpcode;
   MIMGDim Dim;

   uint8_t NumOffsetArgs;

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 16 additions & 6 deletions

@@ -2006,19 +2006,27 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
     MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
   MachineBasicBlock *MBB = MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
+  unsigned IntrOpcode = Intr->BaseOpcode;
+
+  // For image atomic: use no-return opcode if result is unused.
+  if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
+    Register ResultDef = MI.getOperand(0).getReg();
+    if (MRI->use_nodbg_empty(ResultDef))
+      IntrOpcode = Intr->AtomicNoRetBaseOpcode;
+  }

   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
-      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
+      AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);

   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
-  unsigned IntrOpcode = Intr->BaseOpcode;
   const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
   const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
   const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);

   const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;

-  Register VDataIn, VDataOut;
+  Register VDataIn = AMDGPU::NoRegister;
+  Register VDataOut = AMDGPU::NoRegister;
   LLT VDataTy;
   int NumVDataDwords = -1;
   bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
@@ -2049,7 +2057,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   unsigned DMaskLanes = 0;

   if (BaseOpcode->Atomic) {
-    VDataOut = MI.getOperand(0).getReg();
+    if (!BaseOpcode->NoReturn)
+      VDataOut = MI.getOperand(0).getReg();
     VDataIn = MI.getOperand(2).getReg();
     LLT Ty = MRI->getType(VDataIn);

@@ -2099,8 +2108,9 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");

   unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
-  if (BaseOpcode->Atomic)
-    CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
+  // Keep GLC only when the atomic's result is actually used.
+  if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
+    CPol |= AMDGPU::CPol::GLC;
   if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
                AMDGPU::CPol::VOLATILE))
     return false;

llvm/lib/Target/AMDGPU/MIMGInstructions.td

Lines changed: 102 additions & 83 deletions
Large diffs are not rendered by default.

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 28 additions & 9 deletions

@@ -9134,16 +9134,23 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
   SDLoc DL(Op);
   MachineFunction &MF = DAG.getMachineFunction();
   const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+  unsigned IntrOpcode = Intr->BaseOpcode;
+  // For image atomic: use no-return opcode if result is unused.
+  if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
+      !Op.getNode()->hasAnyUseOfValue(0))
+    IntrOpcode = Intr->AtomicNoRetBaseOpcode;
   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
-      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
+      AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);
   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
-  unsigned IntrOpcode = Intr->BaseOpcode;
   bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
   bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
   bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);

   SmallVector<EVT, 3> ResultTypes(Op->values());
   SmallVector<EVT, 3> OrigResultTypes(Op->values());
+  if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
+    ResultTypes.erase(&ResultTypes[0]);
+
   bool IsD16 = false;
   bool IsG16 = false;
   bool IsA16 = false;
@@ -9162,8 +9169,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
     VData = Op.getOperand(2);

     IsAtomicPacked16Bit =
-        (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
-         Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
+        (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
+         IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
+         IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
+         IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);

     bool Is64Bit = VData.getValueSizeInBits() == 64;
     if (BaseOpcode->AtomicX2) {
@@ -9173,7 +9182,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
       if (Is64Bit)
         VData = DAG.getBitcast(MVT::v4i32, VData);

-      ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
+      if (!BaseOpcode->NoReturn)
+        ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
+
       DMask = Is64Bit ? 0xf : 0x3;
       NumVDataDwords = Is64Bit ? 4 : 2;
     } else {
@@ -9399,8 +9410,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
   }

   unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
-  if (BaseOpcode->Atomic)
-    CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
+  // Keep GLC only when the atomic's result is actually used.
+  if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
+    CPol |= AMDGPU::CPol::GLC;
   if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
                AMDGPU::CPol::VOLATILE))
     return Op;
@@ -9512,13 +9524,20 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
     DAG.setNodeMemRefs(NewNode, {MemRef});
   }

+  if (BaseOpcode->NoReturn) {
+    if (BaseOpcode->Atomic)
+      return DAG.getMergeValues(
+          {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL);
+
+    return SDValue(NewNode, 0);
+  }
+
   if (BaseOpcode->AtomicX2) {
     SmallVector<SDValue, 1> Elt;
     DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
     return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
   }
-  if (BaseOpcode->NoReturn)
-    return SDValue(NewNode, 0);
+
   return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
                            Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
                            NumVDataDwords, IsAtomicPacked16Bit, DL);
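
One subtlety in the DAG path: the intrinsic call still produces a result value at the IR level even when the no-return opcode is chosen, so lowerImage merges a poison value for the dead result with the memory chain (the getPOISON/getMergeValues lines in the last hunk above). A minimal sketch of the triggering pattern, assuming the usual operand layout of the image atomic intrinsics (the function name is illustrative):

; %old is never read, so value 0 of the lowered node becomes poison and
; IMAGE_ATOMIC_ADD_NORTN is selected; glc (pre-gfx12) or th:TH_ATOMIC_RETURN
; (gfx12) is no longer set on the emitted instruction.
define amdgpu_ps void @atomic_add_noret(<8 x i32> inreg %rsrc, i32 %s) {
main_body:
  %old = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
  ret void
}

declare i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32, i32, <8 x i32>, i32 immarg, i32 immarg)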

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll

Lines changed: 12 additions & 12 deletions

@@ -1200,7 +1200,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX6-NEXT:    s_mov_b32 s5, s7
 ; GFX6-NEXT:    s_mov_b32 s6, s8
 ; GFX6-NEXT:    s_mov_b32 s7, s9
-; GFX6-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc
+; GFX6-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1213,7 +1213,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX8-NEXT:    s_mov_b32 s5, s7
 ; GFX8-NEXT:    s_mov_b32 s6, s8
 ; GFX8-NEXT:    s_mov_b32 s7, s9
-; GFX8-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc
+; GFX8-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX900-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1226,7 +1226,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX900-NEXT:    s_mov_b32 s5, s7
 ; GFX900-NEXT:    s_mov_b32 s6, s8
 ; GFX900-NEXT:    s_mov_b32 s7, s9
-; GFX900-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc
+; GFX900-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1239,7 +1239,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX90A-NEXT:    s_mov_b32 s5, s7
 ; GFX90A-NEXT:    s_mov_b32 s6, s8
 ; GFX90A-NEXT:    s_mov_b32 s7, s9
-; GFX90A-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc
+; GFX90A-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX10PLUS-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1252,7 +1252,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX10PLUS-NEXT:    s_mov_b32 s5, s7
 ; GFX10PLUS-NEXT:    s_mov_b32 s6, s8
 ; GFX10PLUS-NEXT:    s_mov_b32 s7, s9
-; GFX10PLUS-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc
+; GFX10PLUS-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm
 ; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1265,7 +1265,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
-; GFX12-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3194,7 +3194,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX6-NEXT:    s_mov_b32 s5, s7
 ; GFX6-NEXT:    s_mov_b32 s6, s8
 ; GFX6-NEXT:    s_mov_b32 s7, s9
-; GFX6-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc
+; GFX6-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3207,7 +3207,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX8-NEXT:    s_mov_b32 s5, s7
 ; GFX8-NEXT:    s_mov_b32 s6, s8
 ; GFX8-NEXT:    s_mov_b32 s7, s9
-; GFX8-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc
+; GFX8-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX900-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3220,7 +3220,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX900-NEXT:    s_mov_b32 s5, s7
 ; GFX900-NEXT:    s_mov_b32 s6, s8
 ; GFX900-NEXT:    s_mov_b32 s7, s9
-; GFX900-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc
+; GFX900-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3233,7 +3233,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX90A-NEXT:    s_mov_b32 s5, s7
 ; GFX90A-NEXT:    s_mov_b32 s6, s8
 ; GFX90A-NEXT:    s_mov_b32 s7, s9
-; GFX90A-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc
+; GFX90A-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX10PLUS-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3246,7 +3246,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX10PLUS-NEXT:    s_mov_b32 s5, s7
 ; GFX10PLUS-NEXT:    s_mov_b32 s6, s8
 ; GFX10PLUS-NEXT:    s_mov_b32 s7, s9
-; GFX10PLUS-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc
+; GFX10PLUS-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
 ; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3259,7 +3259,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
-; GFX12-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
