Skip to content

Commit 01b4b2a

Browse files
authored
[AMDGPU][SDAG] Handle ISD::PTRADD in VOP3 patterns (#143881)
This patch mirrors similar patterns for ISD::ADD. The main difference is that ISD::ADD is commutative, so that a pattern definition for, e.g., (add (mul x, y), z), automatically also handles (add z, (mul x, y)). ISD::PTRADD is not commutative, so both operand orders need to be handled explicitly. The patterns in this patch do so through a ptradd_commutative PatFrags that matches (ptradd x, y) as well as (ptradd y, x), so the nested operation (shift or multiply) is folded both when it is the offset of the ptradd (i.e., the right operand) and when it is the base pointer, even though base pointers that are the result of a shift or multiply seem less likely. For SWDEV-516125.
1 parent 72596b3 commit 01b4b2a

10 files changed

+108
-205
lines changed

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@
99
def BITOP3_32 : ComplexPattern<i32, 4, "SelectBITOP3", [and, or, xor]>;
1010
def BITOP3_16 : ComplexPattern<i16, 4, "SelectBITOP3", [and, or, xor]>;
1111

12+
// Matches PTRADD as a commutative operation.
13+
def ptradd_commutative : PatFrags<(ops node:$src0, node:$src1),
14+
[(ptradd node:$src0, node:$src1), (ptradd node:$src1, node:$src0)]>;
15+
1216
// Special case for v_div_fmas_{f32|f64}, since it seems to be the
1317
// only VOP instruction that implicitly reads VCC.
1418
let Asm64 = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod" in {
@@ -938,12 +942,18 @@ def : GCNPat<
938942
(DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
939943
(V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;
940944

941-
let SubtargetPredicate = HasLshlAddU64Inst in
945+
let SubtargetPredicate = HasLshlAddU64Inst in {
942946
def : GCNPat<
943947
(ThreeOpFrag<shl_0_to_4, add> i64:$src0, i32:$src1, i64:$src2),
944948
(V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)
945949
>;
946950

951+
def : GCNPat <
952+
// (ptradd z, (shl x, y)) or (ptradd (shl x, y), z) -> ((x << y) + z)
953+
(ThreeOpFrag<shl_0_to_4, ptradd_commutative> i64:$src0, i32:$src1, i64:$src2),
954+
(V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)>;
955+
} // End SubtargetPredicate = HasLshlAddU64Inst
956+
947957
let SubtargetPredicate = HasAddMinMaxInsts in {
948958
def : ThreeOp_i32_Pats<add, smax, V_ADD_MAX_I32_e64>;
949959
def : ThreeOp_i32_Pats<add, umax, V_ADD_MAX_U32_e64>;
@@ -1019,19 +1029,24 @@ multiclass IMAD32_Pats <VOP3_Pseudo inst> {
10191029

10201030
// Handle cases where amdgpu-codegenprepare-mul24 made a mul24 instead of a normal mul.
10211031
// We need to separate this because otherwise OtherPredicates would be overridden.
1022-
class IMAD32_Mul24_Pat<VOP3_Pseudo inst>: GCNPat <
1023-
(i64 (add (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2)),
1032+
class IMAD32_Mul24_Pats_Impl<VOP3_Pseudo inst, SDPatternOperator AddOp> : GCNPat <
1033+
(i64 (AddOp (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2)),
10241034
(inst $src0, $src1, $src2, 0 /* clamp */)
10251035
>;
10261036

1037+
multiclass IMAD32_Mul24_Pats<VOP3_Pseudo inst> {
1038+
def : IMAD32_Mul24_Pats_Impl<inst, add>;
1039+
def : IMAD32_Mul24_Pats_Impl<inst, ptradd_commutative>;
1040+
}
1041+
10271042
// exclude pre-GFX9 where it was slow
10281043
let OtherPredicates = [HasNotMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in {
10291044
defm : IMAD32_Pats<V_MAD_U64_U32_e64>;
1030-
def : IMAD32_Mul24_Pat<V_MAD_U64_U32_e64>;
1045+
defm : IMAD32_Mul24_Pats<V_MAD_U64_U32_e64>;
10311046
}
10321047
let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in {
10331048
defm : IMAD32_Pats<V_MAD_U64_U32_gfx11_e64>;
1034-
def : IMAD32_Mul24_Pat<V_MAD_U64_U32_gfx11_e64>;
1049+
defm : IMAD32_Mul24_Pats<V_MAD_U64_U32_gfx11_e64>;
10351050
}
10361051

10371052
def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> {

llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -119,10 +119,7 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) {
119119
; GFX942-GISEL: ; %bb.0:
120120
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
121121
; GFX942-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2
122-
; GFX942-GISEL-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
123-
; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
124-
; GFX942-GISEL-NEXT: s_nop 1
125-
; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
122+
; GFX942-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
126123
; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 12, v0
127124
; GFX942-GISEL-NEXT: s_nop 1
128125
; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.async.to.lds.ll

Lines changed: 9 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -270,29 +270,15 @@ entry:
270270
}
271271

272272
define amdgpu_ps void @cluster_load_async_to_lds_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 inreg %mask, i32 %idx) {
273-
; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b64_saddr_no_scale_offset:
274-
; GFX1250-SDAG: ; %bb.0: ; %entry
275-
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1
276-
; GFX1250-SDAG-NEXT: s_mov_b32 m0, s2
277-
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
278-
; GFX1250-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2
279-
; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1]
280-
; GFX1250-SDAG-NEXT: cluster_load_async_to_lds_b64 v0, v[2:3], off offset:16 th:TH_LOAD_NT
281-
; GFX1250-SDAG-NEXT: s_endpgm
282-
;
283-
; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b64_saddr_no_scale_offset:
284-
; GFX1250-GISEL: ; %bb.0: ; %entry
285-
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, v1
286-
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
287-
; GFX1250-GISEL-NEXT: s_mov_b32 m0, s2
288-
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
289-
; GFX1250-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2
290-
; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
291-
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
292-
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
293-
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo
294-
; GFX1250-GISEL-NEXT: cluster_load_async_to_lds_b64 v0, v[2:3], off offset:16 th:TH_LOAD_NT
295-
; GFX1250-GISEL-NEXT: s_endpgm
273+
; GFX1250-LABEL: cluster_load_async_to_lds_b64_saddr_no_scale_offset:
274+
; GFX1250: ; %bb.0: ; %entry
275+
; GFX1250-NEXT: v_mov_b32_e32 v2, v1
276+
; GFX1250-NEXT: s_mov_b32 m0, s2
277+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
278+
; GFX1250-NEXT: v_ashrrev_i32_e32 v3, 31, v2
279+
; GFX1250-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1]
280+
; GFX1250-NEXT: cluster_load_async_to_lds_b64 v0, v[2:3], off offset:16 th:TH_LOAD_NT
281+
; GFX1250-NEXT: s_endpgm
296282
entry:
297283
%idxprom = sext i32 %idx to i64
298284
%gep = getelementptr i32, ptr addrspace(1) %gaddr, i64 %idxprom

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll

Lines changed: 8 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -160,27 +160,14 @@ entry:
160160
}
161161

162162
define amdgpu_ps void @global_load_async_to_lds_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) {
163-
; GFX1250-SDAG-LABEL: global_load_async_to_lds_b64_saddr_no_scale_offset:
164-
; GFX1250-SDAG: ; %bb.0: ; %entry
165-
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1
166-
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
167-
; GFX1250-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2
168-
; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1]
169-
; GFX1250-SDAG-NEXT: global_load_async_to_lds_b64 v0, v[2:3], off offset:16 th:TH_LOAD_NT
170-
; GFX1250-SDAG-NEXT: s_endpgm
171-
;
172-
; GFX1250-GISEL-LABEL: global_load_async_to_lds_b64_saddr_no_scale_offset:
173-
; GFX1250-GISEL: ; %bb.0: ; %entry
174-
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, v1
175-
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
176-
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
177-
; GFX1250-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2
178-
; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
179-
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
180-
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
181-
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo
182-
; GFX1250-GISEL-NEXT: global_load_async_to_lds_b64 v0, v[2:3], off offset:16 th:TH_LOAD_NT
183-
; GFX1250-GISEL-NEXT: s_endpgm
163+
; GFX1250-LABEL: global_load_async_to_lds_b64_saddr_no_scale_offset:
164+
; GFX1250: ; %bb.0: ; %entry
165+
; GFX1250-NEXT: v_mov_b32_e32 v2, v1
166+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
167+
; GFX1250-NEXT: v_ashrrev_i32_e32 v3, 31, v2
168+
; GFX1250-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1]
169+
; GFX1250-NEXT: global_load_async_to_lds_b64 v0, v[2:3], off offset:16 th:TH_LOAD_NT
170+
; GFX1250-NEXT: s_endpgm
184171
entry:
185172
%idxprom = sext i32 %idx to i64
186173
%gep = getelementptr i32, ptr addrspace(1) %gaddr, i64 %idxprom

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll

Lines changed: 8 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -160,27 +160,14 @@ entry:
160160
}
161161

162162
define amdgpu_ps void @global_store_async_from_lds_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) {
163-
; GFX1250-SDAG-LABEL: global_store_async_from_lds_b64_saddr_no_scale_offset:
164-
; GFX1250-SDAG: ; %bb.0: ; %entry
165-
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1
166-
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
167-
; GFX1250-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2
168-
; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1]
169-
; GFX1250-SDAG-NEXT: global_store_async_from_lds_b64 v[2:3], v0, off offset:16 th:TH_STORE_NT
170-
; GFX1250-SDAG-NEXT: s_endpgm
171-
;
172-
; GFX1250-GISEL-LABEL: global_store_async_from_lds_b64_saddr_no_scale_offset:
173-
; GFX1250-GISEL: ; %bb.0: ; %entry
174-
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, v1
175-
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
176-
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
177-
; GFX1250-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2
178-
; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
179-
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
180-
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
181-
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo
182-
; GFX1250-GISEL-NEXT: global_store_async_from_lds_b64 v[2:3], v0, off offset:16 th:TH_STORE_NT
183-
; GFX1250-GISEL-NEXT: s_endpgm
163+
; GFX1250-LABEL: global_store_async_from_lds_b64_saddr_no_scale_offset:
164+
; GFX1250: ; %bb.0: ; %entry
165+
; GFX1250-NEXT: v_mov_b32_e32 v2, v1
166+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
167+
; GFX1250-NEXT: v_ashrrev_i32_e32 v3, 31, v2
168+
; GFX1250-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1]
169+
; GFX1250-NEXT: global_store_async_from_lds_b64 v[2:3], v0, off offset:16 th:TH_STORE_NT
170+
; GFX1250-NEXT: s_endpgm
184171
entry:
185172
%idxprom = sext i32 %idx to i64
186173
%gep = getelementptr i32, ptr addrspace(1) %gaddr, i64 %idxprom

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll

Lines changed: 12 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -169,33 +169,22 @@ entry:
169169
}
170170

171171
define amdgpu_ps void @global_load_monitor_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 %idx) {
172-
; GFX1250-SDAG-LABEL: global_load_monitor_b64_saddr_no_scale_offset:
173-
; GFX1250-SDAG: ; %bb.0: ; %entry
174-
; GFX1250-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2
175-
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
176-
; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1]
177-
; GFX1250-SDAG-NEXT: global_load_monitor_b64 v[2:3], v[2:3], off th:TH_LOAD_NT
178-
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
179-
; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
180-
; GFX1250-SDAG-NEXT: s_endpgm
181-
;
182-
; GFX1250-GISEL-LABEL: global_load_monitor_b64_saddr_no_scale_offset:
183-
; GFX1250-GISEL: ; %bb.0: ; %entry
184-
; GFX1250-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2
185-
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
186-
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
187-
; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
188-
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
189-
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
190-
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo
191-
; GFX1250-GISEL-NEXT: global_load_monitor_b64 v[2:3], v[2:3], off th:TH_LOAD_NT
192-
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
193-
; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
194-
; GFX1250-GISEL-NEXT: s_endpgm
172+
; GFX1250-LABEL: global_load_monitor_b64_saddr_no_scale_offset:
173+
; GFX1250: ; %bb.0: ; %entry
174+
; GFX1250-NEXT: v_ashrrev_i32_e32 v3, 31, v2
175+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
176+
; GFX1250-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1]
177+
; GFX1250-NEXT: global_load_monitor_b64 v[2:3], v[2:3], off th:TH_LOAD_NT
178+
; GFX1250-NEXT: s_wait_loadcnt 0x0
179+
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
180+
; GFX1250-NEXT: s_endpgm
195181
entry:
196182
%idxprom = sext i32 %idx to i64
197183
%gep = getelementptr i32, ptr addrspace(1) %addr, i64 %idxprom
198184
%val = call <2 x i32> @llvm.amdgcn.global.load.monitor.b64.v2i32(ptr addrspace(1) %gep, i32 1)
199185
store <2 x i32> %val, ptr addrspace(1) %use
200186
ret void
201187
}
188+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
189+
; GFX1250-GISEL: {{.*}}
190+
; GFX1250-SDAG: {{.*}}

llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll

Lines changed: 12 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -265,42 +265,25 @@ define amdgpu_kernel void @fold_mad64(ptr addrspace(1) %p) {
265265

266266
; Use non-zero shift amounts in v_lshl_add_u64.
267267
define ptr @select_v_lshl_add_u64(ptr %base, i64 %voffset) {
268-
; GFX942_PTRADD-LABEL: select_v_lshl_add_u64:
269-
; GFX942_PTRADD: ; %bb.0:
270-
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
271-
; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[2:3], 3, v[2:3]
272-
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
273-
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
274-
;
275-
; GFX942_LEGACY-LABEL: select_v_lshl_add_u64:
276-
; GFX942_LEGACY: ; %bb.0:
277-
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
278-
; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 3, v[0:1]
279-
; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
268+
; GFX942-LABEL: select_v_lshl_add_u64:
269+
; GFX942: ; %bb.0:
270+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
271+
; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 3, v[0:1]
272+
; GFX942-NEXT: s_setpc_b64 s[30:31]
280273
%gep = getelementptr inbounds i64, ptr %base, i64 %voffset
281274
ret ptr %gep
282275
}
283276

284277
; Fold mul and add into v_mad, even if amdgpu-codegenprepare-mul24 turned the
285278
; mul into a mul24.
286279
define ptr @fold_mul24_into_mad(ptr %base, i64 %a, i64 %b) {
287-
; GFX942_PTRADD-LABEL: fold_mul24_into_mad:
288-
; GFX942_PTRADD: ; %bb.0:
289-
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
290-
; GFX942_PTRADD-NEXT: v_and_b32_e32 v2, 0xfffff, v2
291-
; GFX942_PTRADD-NEXT: v_and_b32_e32 v4, 0xfffff, v4
292-
; GFX942_PTRADD-NEXT: v_mul_hi_u32_u24_e32 v3, v2, v4
293-
; GFX942_PTRADD-NEXT: v_mul_u32_u24_e32 v2, v2, v4
294-
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
295-
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
296-
;
297-
; GFX942_LEGACY-LABEL: fold_mul24_into_mad:
298-
; GFX942_LEGACY: ; %bb.0:
299-
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
300-
; GFX942_LEGACY-NEXT: v_and_b32_e32 v2, 0xfffff, v2
301-
; GFX942_LEGACY-NEXT: v_and_b32_e32 v3, 0xfffff, v4
302-
; GFX942_LEGACY-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v3, v[0:1]
303-
; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
280+
; GFX942-LABEL: fold_mul24_into_mad:
281+
; GFX942: ; %bb.0:
282+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
283+
; GFX942-NEXT: v_and_b32_e32 v2, 0xfffff, v2
284+
; GFX942-NEXT: v_and_b32_e32 v3, 0xfffff, v4
285+
; GFX942-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v3, v[0:1]
286+
; GFX942-NEXT: s_setpc_b64 s[30:31]
304287
%a_masked = and i64 %a, u0xfffff
305288
%b_masked = and i64 %b, u0xfffff
306289
%mul = mul i64 %a_masked, %b_masked

llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll

Lines changed: 14 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -25,20 +25,12 @@ define ptr @gep_as0(ptr %p, i64 %offset) {
2525
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2626
; GFX8-NEXT: s_setpc_b64 s[30:31]
2727
;
28-
; GFX942_PTRADD-LABEL: gep_as0:
29-
; GFX942_PTRADD: ; %bb.0: ; %entry
30-
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31-
; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
32-
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
33-
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 5
34-
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
35-
;
36-
; GFX942_LEGACY-LABEL: gep_as0:
37-
; GFX942_LEGACY: ; %bb.0: ; %entry
38-
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39-
; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
40-
; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 5
41-
; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
28+
; GFX942-LABEL: gep_as0:
29+
; GFX942: ; %bb.0: ; %entry
30+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31+
; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
32+
; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 5
33+
; GFX942-NEXT: s_setpc_b64 s[30:31]
4234
;
4335
; GFX10-LABEL: gep_as0:
4436
; GFX10: ; %bb.0: ; %entry
@@ -188,20 +180,12 @@ define ptr @multi_gep_as0(ptr %p, i64 %offset) {
188180
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
189181
; GFX8-NEXT: s_setpc_b64 s[30:31]
190182
;
191-
; GFX942_PTRADD-LABEL: multi_gep_as0:
192-
; GFX942_PTRADD: ; %bb.0: ; %entry
193-
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
194-
; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
195-
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
196-
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 5
197-
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
198-
;
199-
; GFX942_LEGACY-LABEL: multi_gep_as0:
200-
; GFX942_LEGACY: ; %bb.0: ; %entry
201-
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
202-
; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
203-
; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 5
204-
; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
183+
; GFX942-LABEL: multi_gep_as0:
184+
; GFX942: ; %bb.0: ; %entry
185+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
186+
; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
187+
; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 5
188+
; GFX942-NEXT: s_setpc_b64 s[30:31]
205189
;
206190
; GFX10-LABEL: multi_gep_as0:
207191
; GFX10: ; %bb.0: ; %entry
@@ -537,3 +521,5 @@ entry:
537521
; GFX12_PTRADD: {{.*}}
538522
; GFX8_LEGACY: {{.*}}
539523
; GFX8_PTRADD: {{.*}}
524+
; GFX942_LEGACY: {{.*}}
525+
; GFX942_PTRADD: {{.*}}

0 commit comments

Comments
 (0)