Skip to content

Commit 09583de

Browse files
authored
AMDGPU: Reduce 64-bit add width if low bits are known 0 (#122049)
If one of the inputs has all 0 bits in its low 32 bits, the low part cannot carry and we can just pass through the low half of the original value, performing only a 32-bit operation on the high half. Add case: https://alive2.llvm.org/ce/z/TNc7hf Sub case: https://alive2.llvm.org/ce/z/AjH2-J We could do this in the general case with computeKnownBits, but add is so common that this could be expensive for something which will fire infrequently. One potential concern is that this could break the 64-bit add we expect to see for addressing mode matching, but these constants shouldn't appear often in addressing expressions. One test for large offset expressions changes but isn't worse. Fixes ROCm#237
1 parent 6376418 commit 09583de

File tree

6 files changed

+177
-183
lines changed

6 files changed

+177
-183
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13985,6 +13985,43 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
1398513985
return Accum;
1398613986
}
1398713987

13988+
// Fold a 64-bit add/sub whose RHS is a constant with at least 32 trailing
// zero bits into a 32-bit operation on the high half only. The low 32 bits
// of the LHS are passed through with a TRUNCATE and recombined with the new
// high half using BUILD_PAIR.
//
// Returns an empty SDValue when the RHS is not a ConstantSDNode or when the
// constant has fewer than 32 trailing zero bits.
SDValue
13989+
SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
13990+
DAGCombinerInfo &DCI) const {
13991+
SDValue RHS = N->getOperand(1);
13992+
auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
13993+
if (!CRHS)
13994+
return SDValue();
13995+
13996+
// TODO: Worth using computeKnownBits? Maybe expensive since it's so
13997+
// common.
13998+
uint64_t Val = CRHS->getZExtValue();
13999+
if (countr_zero(Val) >= 32) {
14000+
SelectionDAG &DAG = DCI.DAG;
14001+
SDLoc SL(N);
14002+
SDValue LHS = N->getOperand(0);
14003+
14004+
// Avoid carry machinery if we know the low half of the add/sub does not
14005+
// contribute to the high half of the final result.
14006+
//
14007+
// add/sub i64:x, K if computeTrailingZeros(K) >= 32
14008+
// => build_pair x.lo, (add/sub x.hi, K.hi)
14009+
14010+
// Breaking the 64-bit add here with this strange constant is unlikely
14011+
// to interfere with addressing mode patterns.
14012+
14013+
SDValue Hi = getHiHalf64(LHS, DAG);
14014+
SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
14015+
// Reuse the original node's opcode and flags for the 32-bit high-half op.
SDValue AddHi =
14016+
DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
14017+
14018+
// The low half is the LHS's low 32 bits, unchanged.
SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
14019+
return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
14020+
}
14021+
14022+
return SDValue();
14023+
}
14024+
1398814025
// Collect the ultimate src of each of the mul node's operands, and confirm
1398914026
// each operand is 8 bytes.
1399014027
static std::optional<ByteProvider<SDValue>>
@@ -14261,6 +14298,11 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
1426114298
return V;
1426214299
}
1426314300

14301+
if (VT == MVT::i64) {
14302+
if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14303+
return Folded;
14304+
}
14305+
1426414306
if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
1426514307
(Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
1426614308
SDValue TempNode(N, 0);
@@ -14446,6 +14488,11 @@ SDValue SITargetLowering::performSubCombine(SDNode *N,
1444614488
SelectionDAG &DAG = DCI.DAG;
1444714489
EVT VT = N->getValueType(0);
1444814490

14491+
if (VT == MVT::i64) {
14492+
if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14493+
return Folded;
14494+
}
14495+
1444914496
if (VT != MVT::i32)
1445014497
return SDValue();
1445114498

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
212212
unsigned getFusedOpcode(const SelectionDAG &DAG,
213213
const SDNode *N0, const SDNode *N1) const;
214214
SDValue tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const;
215+
// Try to narrow a 64-bit add/sub with a constant RHS whose low 32 bits are
// zero to a 32-bit operation on the high half. Returns an empty SDValue if
// the fold does not apply.
SDValue foldAddSub64WithZeroLowBitsTo32(SDNode *N,
216+
DAGCombinerInfo &DCI) const;
217+
215218
SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
216219
SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const;
217220
SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;

llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll

Lines changed: 18 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,7 @@
1010
define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_0(i64 inreg %reg) {
1111
; GFX9-LABEL: s_add_i64_const_low_bits_known0_0:
1212
; GFX9: ; %bb.0:
13-
; GFX9-NEXT: s_add_u32 s0, s0, 0
14-
; GFX9-NEXT: s_addc_u32 s1, s1, 0x40000
13+
; GFX9-NEXT: s_add_i32 s1, s1, 0x40000
1514
; GFX9-NEXT: ; return to shader part epilog
1615
%add = add i64 %reg, 1125899906842624 ; (1 << 50)
1716
ret i64 %add
@@ -20,8 +19,7 @@ define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_0(i64 inreg %reg) {
2019
define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_1(i64 inreg %reg) {
2120
; GFX9-LABEL: s_add_i64_const_low_bits_known0_1:
2221
; GFX9: ; %bb.0:
23-
; GFX9-NEXT: s_add_u32 s0, s0, 0
24-
; GFX9-NEXT: s_addc_u32 s1, s1, 1
22+
; GFX9-NEXT: s_add_i32 s1, s1, 1
2523
; GFX9-NEXT: ; return to shader part epilog
2624
%add = add i64 %reg, 4294967296 ; (1 << 32)
2725
ret i64 %add
@@ -30,8 +28,7 @@ define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_1(i64 inreg %reg) {
3028
define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_2(i64 inreg %reg) {
3129
; GFX9-LABEL: s_add_i64_const_low_bits_known0_2:
3230
; GFX9: ; %bb.0:
33-
; GFX9-NEXT: s_add_u32 s0, s0, 0
34-
; GFX9-NEXT: s_addc_u32 s1, s1, 2
31+
; GFX9-NEXT: s_add_i32 s1, s1, 2
3532
; GFX9-NEXT: ; return to shader part epilog
3633
%add = add i64 %reg, 8589934592 ; (1 << 33)
3734
ret i64 %add
@@ -40,8 +37,7 @@ define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_2(i64 inreg %reg) {
4037
define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_3(i64 inreg %reg) {
4138
; GFX9-LABEL: s_add_i64_const_low_bits_known0_3:
4239
; GFX9: ; %bb.0:
43-
; GFX9-NEXT: s_add_u32 s0, s0, 0
44-
; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000
40+
; GFX9-NEXT: s_add_i32 s1, s1, 0x80000000
4541
; GFX9-NEXT: ; return to shader part epilog
4642
%add = add i64 %reg, -9223372036854775808 ; (1 << 63)
4743
ret i64 %add
@@ -50,8 +46,7 @@ define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_3(i64 inreg %reg) {
5046
define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_4(i64 inreg %reg) {
5147
; GFX9-LABEL: s_add_i64_const_low_bits_known0_4:
5248
; GFX9: ; %bb.0:
53-
; GFX9-NEXT: s_add_u32 s0, s0, 0
54-
; GFX9-NEXT: s_addc_u32 s1, s1, -1
49+
; GFX9-NEXT: s_add_i32 s1, s1, -1
5550
; GFX9-NEXT: ; return to shader part epilog
5651
%add = add i64 %reg, -4294967296 ; 0xffffffff00000000
5752
ret i64 %add
@@ -61,9 +56,7 @@ define i64 @v_add_i64_const_low_bits_known0_0(i64 %reg) {
6156
; GFX9-LABEL: v_add_i64_const_low_bits_known0_0:
6257
; GFX9: ; %bb.0:
6358
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
64-
; GFX9-NEXT: v_mov_b32_e32 v2, 0x40000
65-
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
66-
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
59+
; GFX9-NEXT: v_add_u32_e32 v1, 0x40000, v1
6760
; GFX9-NEXT: s_setpc_b64 s[30:31]
6861
%add = add i64 %reg, 1125899906842624 ; (1 << 50)
6962
ret i64 %add
@@ -73,8 +66,7 @@ define i64 @v_add_i64_const_low_bits_known0_1(i64 %reg) {
7366
; GFX9-LABEL: v_add_i64_const_low_bits_known0_1:
7467
; GFX9: ; %bb.0:
7568
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
76-
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
77-
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
69+
; GFX9-NEXT: v_add_u32_e32 v1, 1, v1
7870
; GFX9-NEXT: s_setpc_b64 s[30:31]
7971
%add = add i64 %reg, 4294967296 ; (1 << 32)
8072
ret i64 %add
@@ -84,8 +76,7 @@ define i64 @v_add_i64_const_low_bits_known0_2(i64 %reg) {
8476
; GFX9-LABEL: v_add_i64_const_low_bits_known0_2:
8577
; GFX9: ; %bb.0:
8678
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
87-
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
88-
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
79+
; GFX9-NEXT: v_add_u32_e32 v1, 2, v1
8980
; GFX9-NEXT: s_setpc_b64 s[30:31]
9081
%add = add i64 %reg, 8589934592 ; (1 << 33)
9182
ret i64 %add
@@ -95,9 +86,7 @@ define i64 @v_add_i64_const_low_bits_known0_3(i64 %reg) {
9586
; GFX9-LABEL: v_add_i64_const_low_bits_known0_3:
9687
; GFX9: ; %bb.0:
9788
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
98-
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
99-
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
100-
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
89+
; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v1
10190
; GFX9-NEXT: s_setpc_b64 s[30:31]
10291
%add = add i64 %reg, -9223372036854775808 ; (1 << 63)
10392
ret i64 %add
@@ -107,8 +96,7 @@ define i64 @v_add_i64_const_low_bits_known0_4(i64 %reg) {
10796
; GFX9-LABEL: v_add_i64_const_low_bits_known0_4:
10897
; GFX9: ; %bb.0:
10998
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
110-
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
111-
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
99+
; GFX9-NEXT: v_add_u32_e32 v1, -1, v1
112100
; GFX9-NEXT: s_setpc_b64 s[30:31]
113101
%add = add i64 %reg, -4294967296 ; 0xffffffff00000000
114102
ret i64 %add
@@ -139,10 +127,8 @@ define <2 x i64> @v_add_v2i64_splat_const_low_bits_known0_0(<2 x i64> %reg) {
139127
; GFX9-LABEL: v_add_v2i64_splat_const_low_bits_known0_0:
140128
; GFX9: ; %bb.0:
141129
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
142-
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
143-
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
144-
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2
145-
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 1, v3, vcc
130+
; GFX9-NEXT: v_add_u32_e32 v1, 1, v1
131+
; GFX9-NEXT: v_add_u32_e32 v3, 1, v3
146132
; GFX9-NEXT: s_setpc_b64 s[30:31]
147133
%add = add <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
148134
ret <2 x i64> %add
@@ -152,10 +138,8 @@ define <2 x i64> @v_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) {
152138
; GFX9-LABEL: v_add_v2i64_nonsplat_const_low_bits_known0_0:
153139
; GFX9: ; %bb.0:
154140
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
155-
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
156-
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
157-
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2
158-
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 2, v3, vcc
141+
; GFX9-NEXT: v_add_u32_e32 v1, 1, v1
142+
; GFX9-NEXT: v_add_u32_e32 v3, 2, v3
159143
; GFX9-NEXT: s_setpc_b64 s[30:31]
160144
%add = add <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
161145
ret <2 x i64> %add
@@ -164,10 +148,8 @@ define <2 x i64> @v_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) {
164148
define amdgpu_ps <2 x i64> @s_add_v2i64_splat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
165149
; GFX9-LABEL: s_add_v2i64_splat_const_low_bits_known0_0:
166150
; GFX9: ; %bb.0:
167-
; GFX9-NEXT: s_add_u32 s0, s0, 0
168-
; GFX9-NEXT: s_addc_u32 s1, s1, 1
169-
; GFX9-NEXT: s_add_u32 s2, s2, 0
170-
; GFX9-NEXT: s_addc_u32 s3, s3, 1
151+
; GFX9-NEXT: s_add_i32 s1, s1, 1
152+
; GFX9-NEXT: s_add_i32 s3, s3, 1
171153
; GFX9-NEXT: ; return to shader part epilog
172154
%add = add <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
173155
ret <2 x i64> %add
@@ -176,10 +158,8 @@ define amdgpu_ps <2 x i64> @s_add_v2i64_splat_const_low_bits_known0_0(<2 x i64>
176158
define amdgpu_ps <2 x i64> @s_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
177159
; GFX9-LABEL: s_add_v2i64_nonsplat_const_low_bits_known0_0:
178160
; GFX9: ; %bb.0:
179-
; GFX9-NEXT: s_add_u32 s0, s0, 0
180-
; GFX9-NEXT: s_addc_u32 s1, s1, 1
181-
; GFX9-NEXT: s_add_u32 s2, s2, 0
182-
; GFX9-NEXT: s_addc_u32 s3, s3, 2
161+
; GFX9-NEXT: s_add_i32 s1, s1, 1
162+
; GFX9-NEXT: s_add_i32 s3, s3, 2
183163
; GFX9-NEXT: ; return to shader part epilog
184164
%add = add <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
185165
ret <2 x i64> %add

llvm/test/CodeGen/AMDGPU/global-saddr-load.ll

Lines changed: 22 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -668,37 +668,32 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0xFFFFFFFF(ptr addrspace(1)
668668
define amdgpu_ps float @global_load_saddr_i8_offset_0x100000000(ptr addrspace(1) inreg %sbase) {
669669
; GFX9-LABEL: global_load_saddr_i8_offset_0x100000000:
670670
; GFX9: ; %bb.0:
671-
; GFX9-NEXT: v_mov_b32_e32 v1, s3
672-
; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
673-
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
674-
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
671+
; GFX9-NEXT: s_add_i32 s3, s3, 1
672+
; GFX9-NEXT: v_mov_b32_e32 v0, 0
673+
; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3]
675674
; GFX9-NEXT: s_waitcnt vmcnt(0)
676675
; GFX9-NEXT: ; return to shader part epilog
677676
;
678677
; GFX10-LABEL: global_load_saddr_i8_offset_0x100000000:
679678
; GFX10: ; %bb.0:
680-
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
681-
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
682-
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
679+
; GFX10-NEXT: v_mov_b32_e32 v0, 0
680+
; GFX10-NEXT: s_add_i32 s3, s3, 1
681+
; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
683682
; GFX10-NEXT: s_waitcnt vmcnt(0)
684683
; GFX10-NEXT: ; return to shader part epilog
685684
;
686685
; GFX11-LABEL: global_load_saddr_i8_offset_0x100000000:
687686
; GFX11: ; %bb.0:
688-
; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
689-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
690-
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
691-
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
687+
; GFX11-NEXT: v_mov_b32_e32 v0, 0
688+
; GFX11-NEXT: s_add_i32 s3, s3, 1
689+
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
692690
; GFX11-NEXT: s_waitcnt vmcnt(0)
693691
; GFX11-NEXT: ; return to shader part epilog
694692
;
695693
; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000000:
696694
; GFX12-SDAG: ; %bb.0:
697-
; GFX12-SDAG-NEXT: s_mov_b32 s0, 0
698-
; GFX12-SDAG-NEXT: s_mov_b32 s1, 1
699-
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
700-
; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
701-
; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0
695+
; GFX12-SDAG-NEXT: s_add_co_i32 s3, s3, 1
696+
; GFX12-SDAG-NEXT: s_load_u8 s0, s[2:3], 0x0
702697
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
703698
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
704699
; GFX12-SDAG-NEXT: ; return to shader part epilog
@@ -934,37 +929,32 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg0xFFFFFFFF(ptr addrspace(
934929
define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000000(ptr addrspace(1) inreg %sbase) {
935930
; GFX9-LABEL: global_load_saddr_i8_offset_neg0x100000000:
936931
; GFX9: ; %bb.0:
937-
; GFX9-NEXT: v_mov_b32_e32 v1, s3
938-
; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
939-
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
940-
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
932+
; GFX9-NEXT: s_add_i32 s3, s3, -1
933+
; GFX9-NEXT: v_mov_b32_e32 v0, 0
934+
; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3]
941935
; GFX9-NEXT: s_waitcnt vmcnt(0)
942936
; GFX9-NEXT: ; return to shader part epilog
943937
;
944938
; GFX10-LABEL: global_load_saddr_i8_offset_neg0x100000000:
945939
; GFX10: ; %bb.0:
946-
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
947-
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
948-
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
940+
; GFX10-NEXT: v_mov_b32_e32 v0, 0
941+
; GFX10-NEXT: s_add_i32 s3, s3, -1
942+
; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
949943
; GFX10-NEXT: s_waitcnt vmcnt(0)
950944
; GFX10-NEXT: ; return to shader part epilog
951945
;
952946
; GFX11-LABEL: global_load_saddr_i8_offset_neg0x100000000:
953947
; GFX11: ; %bb.0:
954-
; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
955-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
956-
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
957-
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
948+
; GFX11-NEXT: v_mov_b32_e32 v0, 0
949+
; GFX11-NEXT: s_add_i32 s3, s3, -1
950+
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
958951
; GFX11-NEXT: s_waitcnt vmcnt(0)
959952
; GFX11-NEXT: ; return to shader part epilog
960953
;
961954
; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg0x100000000:
962955
; GFX12-SDAG: ; %bb.0:
963-
; GFX12-SDAG-NEXT: s_mov_b32 s0, 0
964-
; GFX12-SDAG-NEXT: s_mov_b32 s1, -1
965-
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
966-
; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
967-
; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0
956+
; GFX12-SDAG-NEXT: s_add_co_i32 s3, s3, -1
957+
; GFX12-SDAG-NEXT: s_load_u8 s0, s[2:3], 0x0
968958
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
969959
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
970960
; GFX12-SDAG-NEXT: ; return to shader part epilog

0 commit comments

Comments
 (0)