Commit 295561a

AMDGPU: Handle folding frame indexes into add with immediate
Frame index materialization can fold the constant offset into adds with immediates. The mubuf expansion is more complicated because we also have to insert the shift, so restrict this to a single use for now. This is preparation to avoid regressions in a future patch.

This also misses some cases due to visitation order: it depends on the immediate already having been folded into the instruction.
Parent: 489d516
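
A minimal before/after MIR sketch of the fold, mirroring the fold_frame_index__s_add_i32__fi_const test updated below (virtual register numbers are illustrative):

    ; Before: the frame index is first materialized into a register.
    %0:sreg_32 = S_MOV_B32 %stack.0
    %1:sreg_32 = S_ADD_I32 %0, 128, implicit-def $scc

    ; After: the frame index is folded directly into the add, leaving the
    ; S_MOV_B32 dead.
    %1:sreg_32 = S_ADD_I32 %stack.0, 128, implicit-def $scc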

File tree: 6 files changed (+32, -27 lines)

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 14 additions & 0 deletions
@@ -194,6 +194,20 @@ bool SIFoldOperandsImpl::frameIndexMayFold(
     return false;
 
   const unsigned Opc = UseMI.getOpcode();
+  switch (Opc) {
+  case AMDGPU::S_ADD_I32:
+  case AMDGPU::V_ADD_U32_e32:
+  case AMDGPU::V_ADD_CO_U32_e32:
+    // TODO: Handle e64 variants
+    // TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
+    // to insert the wave size shift at every point we use the index.
+    // TODO: Fix depending on visit order to fold immediates into the operand
+    return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() &&
+           MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
+  default:
+    break;
+  }
+
   if (TII->isMUBUF(UseMI))
     return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
   if (!TII->isFLATScratch(UseMI))
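
The one-use restriction in the check above means a frame index register with more than one non-debug use keeps its materializing move. A hypothetical MIR sketch of such a case (not taken from the tests in this commit):

    ; %stack.0 has two non-debug uses, so neither add absorbs it; folding it
    ; would repeat the materialization (and, for mubuf addressing, the
    ; wave-size shift) at every use point.
    %0:sreg_32 = S_MOV_B32 %stack.0
    %1:sreg_32 = S_ADD_I32 %0, 128, implicit-def $scc
    %2:sreg_32 = S_ADD_I32 %0, 256, implicit-def $scc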

llvm/test/CodeGen/AMDGPU/flat-scratch.ll

Lines changed: 2 additions & 4 deletions
@@ -4705,8 +4705,7 @@ define amdgpu_ps void @large_offset() {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    s_movk_i32 s0, 0x810
-; GFX10-NEXT:    s_addk_i32 s0, 0x3c0
+; GFX10-NEXT:    s_movk_i32 s0, 0xbd0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0
@@ -4823,8 +4822,7 @@ define amdgpu_ps void @large_offset() {
 ; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x810
-; GFX10-PAL-NEXT:    s_addk_i32 s0, 0x3c0
+; GFX10-PAL-NEXT:    s_movk_i32 s0, 0xbd0
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, v0

llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.gfx10.mir

Lines changed: 1 addition & 2 deletions
@@ -13,8 +13,7 @@ stack:
 body: |
   bb.0:
     ; CHECK-LABEL: name: fold_frame_index__v_add_u32_e32__const_v_fi
-    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
-    ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, [[V_MOV_B32_e32_]], implicit $exec
+    ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, %stack.0, implicit $exec
     ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_U32_e32_]]
     ; CHECK-NEXT: SI_RETURN implicit $vgpr0
     %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec

llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir

Lines changed: 6 additions & 12 deletions
@@ -14,8 +14,7 @@ stack:
 body: |
   bb.0:
     ; CHECK-LABEL: name: fold_frame_index__s_add_i32__fi_const
-    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0
-    ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_MOV_B32_]], 128, implicit-def $scc
+    ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, 128, implicit-def $scc
     ; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]]
     ; CHECK-NEXT: SI_RETURN implicit $sgpr4
     %0:sreg_32 = S_MOV_B32 %stack.0
@@ -35,8 +34,7 @@ stack:
 body: |
   bb.0:
     ; CHECK-LABEL: name: fold_frame_index__s_add_i32__const_fi
-    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0
-    ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 128, [[S_MOV_B32_]], implicit-def $scc
+    ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 128, %stack.0, implicit-def $scc
     ; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]]
     ; CHECK-NEXT: SI_RETURN implicit $sgpr4
     %0:sreg_32 = S_MOV_B32 %stack.0
@@ -56,8 +54,7 @@ stack:
 body: |
   bb.0:
     ; CHECK-LABEL: name: fold_frame_index__s_add_i32__materializedconst_fi
-    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0
-    ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, [[S_MOV_B32_]], implicit-def $scc
+    ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc
     ; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]]
     ; CHECK-NEXT: SI_RETURN implicit $sgpr4
     %0:sreg_32 = S_MOV_B32 256
@@ -101,8 +98,7 @@ stack:
 body: |
   bb.0:
     ; CHECK-LABEL: name: fold_frame_index__s_add_i32__fi_materializedconst_1
-    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0
-    ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, [[S_MOV_B32_]], implicit-def $scc
+    ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc
     ; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]]
     ; CHECK-NEXT: SI_RETURN implicit $sgpr4
     %0:sreg_32 = S_MOV_B32 256
@@ -173,8 +169,7 @@ stack:
 body: |
   bb.0:
     ; CHECK-LABEL: name: fold_frame_index__v_add_u32_e32__const_v_fi
-    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
-    ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, [[V_MOV_B32_e32_]], implicit $exec
+    ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, %stack.0, implicit $exec
     ; CHECK-NEXT: $sgpr4 = COPY [[V_ADD_U32_e32_]]
     ; CHECK-NEXT: SI_RETURN implicit $sgpr4
     %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
@@ -278,8 +273,7 @@ stack:
 body: |
   bb.0:
     ; CHECK-LABEL: name: fold_frame_index__v_add_co_u32_e32__const_v_fi
-    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
-    ; CHECK-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 128, [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $exec
+    ; CHECK: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 128, %stack.0, implicit-def $vcc, implicit $exec
     ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e32_]]
     ; CHECK-NEXT: SI_RETURN implicit $vgpr0
     %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec

llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll

Lines changed: 3 additions & 3 deletions
@@ -1550,10 +1550,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
 ; GFX8-NEXT:    s_add_i32 s6, s32, 0x201000
 ; GFX8-NEXT:    buffer_store_dword v2, off, s[0:3], s6 ; 4-byte Folded Spill
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX8-NEXT:    v_lshrrev_b32_e64 v1, 6, s32
 ; GFX8-NEXT:    s_movk_i32 vcc_lo, 0x4040
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, vcc_lo, v0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x3ec, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, vcc_lo, v1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x3ec, v1
 ; GFX8-NEXT:    v_writelane_b32 v2, s59, 0
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v1, 6, s32
 ; GFX8-NEXT:    v_readfirstlane_b32 s59, v0

llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll

Lines changed: 6 additions & 6 deletions
@@ -1582,12 +1582,12 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX7-NEXT:    buffer_store_dword v15, v16, s[0:3], s32 offen offset:60 ; 4-byte Folded Spill
 ; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32
 ; GFX7-NEXT:    ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX7-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; GFX7-NEXT:    v_lshr_b32_e64 v1, s32, 6
 ; GFX7-NEXT:    v_writelane_b32 v22, vcc_lo, 0
 ; GFX7-NEXT:    v_writelane_b32 v22, vcc_hi, 1
 ; GFX7-NEXT:    s_movk_i32 vcc_lo, 0x4040
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, vcc_lo, v0
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x200, v0
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, vcc_lo, v1
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x200, v1
 ; GFX7-NEXT:    v_writelane_b32 v23, s59, 27
 ; GFX7-NEXT:    v_readfirstlane_b32 s59, v0
 ; GFX7-NEXT:    s_and_b64 vcc, 0, exec
@@ -1723,12 +1723,12 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
 ; GFX8-NEXT:    buffer_store_dword v15, v16, s[0:3], s32 offen offset:60 ; 4-byte Folded Spill
 ; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32
 ; GFX8-NEXT:    ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX8-NEXT:    v_lshrrev_b32_e64 v1, 6, s32
 ; GFX8-NEXT:    v_writelane_b32 v22, vcc_lo, 0
 ; GFX8-NEXT:    v_writelane_b32 v22, vcc_hi, 1
 ; GFX8-NEXT:    s_movk_i32 vcc_lo, 0x4040
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, vcc_lo, v0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x200, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, vcc_lo, v1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x200, v1
 ; GFX8-NEXT:    v_writelane_b32 v23, s59, 27
 ; GFX8-NEXT:    v_readfirstlane_b32 s59, v0
 ; GFX8-NEXT:    s_and_b64 vcc, 0, exec
