Skip to content

Commit 125b6b5

Browse files
authored
[AMDGPU] Generate s_lshl?_add_u32 (llvm#167032)
Generate s_lshl?_add_u32 through SDAG.

---------

Signed-off-by: John Lu <[email protected]>
1 parent 863730f commit 125b6b5

24 files changed: 787 additions (+787) and 802 deletions (-802).

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 29 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -7633,6 +7633,8 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
76337633
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
76347634
unsigned Opcode = Inst.getOpcode();
76357635
unsigned NewOpcode = getVALUOp(Inst);
7636+
const DebugLoc &DL = Inst.getDebugLoc();
7637+
76367638
// Handle some special cases
76377639
switch (Opcode) {
76387640
default:
@@ -7870,7 +7872,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
78707872
return;
78717873
case AMDGPU::S_UADDO_PSEUDO:
78727874
case AMDGPU::S_USUBO_PSEUDO: {
7873-
const DebugLoc &DL = Inst.getDebugLoc();
78747875
MachineOperand &Dest0 = Inst.getOperand(0);
78757876
MachineOperand &Dest1 = Inst.getOperand(1);
78767877
MachineOperand &Src0 = Inst.getOperand(2);
@@ -7890,12 +7891,37 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
78907891

78917892
legalizeOperands(*NewInstr, MDT);
78927893
MRI.replaceRegWith(Dest0.getReg(), DestReg);
7893-
addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7894-
Worklist);
7894+
addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
78957895
Inst.eraseFromParent();
78967896
}
78977897
return;
7898+
case AMDGPU::S_LSHL1_ADD_U32:
7899+
case AMDGPU::S_LSHL2_ADD_U32:
7900+
case AMDGPU::S_LSHL3_ADD_U32:
7901+
case AMDGPU::S_LSHL4_ADD_U32: {
7902+
MachineOperand &Dest = Inst.getOperand(0);
7903+
MachineOperand &Src0 = Inst.getOperand(1);
7904+
MachineOperand &Src1 = Inst.getOperand(2);
7905+
unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
7906+
: Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
7907+
: Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
7908+
: 4);
7909+
7910+
const TargetRegisterClass *NewRC =
7911+
RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
7912+
Register DestReg = MRI.createVirtualRegister(NewRC);
7913+
MachineInstr *NewInstr =
7914+
BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
7915+
.add(Src0)
7916+
.addImm(ShiftAmt)
7917+
.add(Src1);
78987918

7919+
legalizeOperands(*NewInstr, MDT);
7920+
MRI.replaceRegWith(Dest.getReg(), DestReg);
7921+
addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7922+
Inst.eraseFromParent();
7923+
}
7924+
return;
78997925
case AMDGPU::S_CSELECT_B32:
79007926
case AMDGPU::S_CSELECT_B64:
79017927
lowerSelect(Worklist, Inst, MDT);
@@ -7992,7 +8018,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
79928018
return;
79938019
}
79948020
case AMDGPU::S_CVT_HI_F32_F16: {
7995-
const DebugLoc &DL = Inst.getDebugLoc();
79968021
Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
79978022
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
79988023
if (ST.useRealTrue16Insts()) {
@@ -8022,7 +8047,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
80228047
}
80238048
case AMDGPU::S_MINIMUM_F32:
80248049
case AMDGPU::S_MAXIMUM_F32: {
8025-
const DebugLoc &DL = Inst.getDebugLoc();
80268050
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
80278051
MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
80288052
.addImm(0) // src0_modifiers
@@ -8040,7 +8064,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
80408064
}
80418065
case AMDGPU::S_MINIMUM_F16:
80428066
case AMDGPU::S_MAXIMUM_F16: {
8043-
const DebugLoc &DL = Inst.getDebugLoc();
80448067
Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
80458068
? &AMDGPU::VGPR_16RegClass
80468069
: &AMDGPU::VGPR_32RegClass);
@@ -8064,7 +8087,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
80648087
case AMDGPU::V_S_RCP_F16_e64:
80658088
case AMDGPU::V_S_RSQ_F16_e64:
80668089
case AMDGPU::V_S_SQRT_F16_e64: {
8067-
const DebugLoc &DL = Inst.getDebugLoc();
80688090
Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
80698091
? &AMDGPU::VGPR_16RegClass
80708092
: &AMDGPU::VGPR_32RegClass);

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -776,11 +776,7 @@ def xnor : PatFrag <
776776
foreach I = 1-4 in {
777777
def shl#I#_add : PatFrag <
778778
(ops node:$src0, node:$src1),
779-
(add (shl_oneuse $src0, (i32 I)), $src1)> {
780-
// FIXME: Poor substitute for disabling pattern in SelectionDAG
781-
let PredicateCode = [{return false;}];
782-
let GISelPredicateCode = [{return true;}];
783-
}
779+
(add (shl_oneuse $src0, (i32 I)), $src1)>;
784780
}
785781

786782
multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,

llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll

Lines changed: 8 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -51,10 +51,8 @@ define amdgpu_cs_chain void @test_alloca_var_uniform(i32 inreg %count) {
5151
; GFX12-NEXT: s_wait_samplecnt 0x0
5252
; GFX12-NEXT: s_wait_bvhcnt 0x0
5353
; GFX12-NEXT: s_wait_kmcnt 0x0
54-
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
54+
; GFX12-NEXT: s_lshl2_add_u32 s0, s0, 15
5555
; GFX12-NEXT: v_mov_b32_e32 v0, 0
56-
; GFX12-NEXT: s_wait_alu 0xfffe
57-
; GFX12-NEXT: s_add_co_i32 s0, s0, 15
5856
; GFX12-NEXT: s_mov_b32 s32, 16
5957
; GFX12-NEXT: s_wait_alu 0xfffe
6058
; GFX12-NEXT: s_and_b32 s0, s0, -16
@@ -69,8 +67,7 @@ define amdgpu_cs_chain void @test_alloca_var_uniform(i32 inreg %count) {
6967
; GFX942-LABEL: test_alloca_var_uniform:
7068
; GFX942: ; %bb.0:
7169
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
72-
; GFX942-NEXT: s_lshl_b32 s0, s0, 2
73-
; GFX942-NEXT: s_add_i32 s0, s0, 15
70+
; GFX942-NEXT: s_lshl2_add_u32 s0, s0, 15
7471
; GFX942-NEXT: s_mov_b32 s32, 16
7572
; GFX942-NEXT: s_and_b32 s0, s0, -16
7673
; GFX942-NEXT: v_mov_b32_e32 v0, 0
@@ -211,15 +208,13 @@ define amdgpu_cs_chain void @test_alloca_and_call_var_uniform(i32 inreg %count)
211208
; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12
212209
; GFX12-NEXT: s_wait_alu 0xfffe
213210
; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24
214-
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
211+
; GFX12-NEXT: s_lshl2_add_u32 s0, s0, 15
215212
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
216-
; GFX12-NEXT: s_add_co_i32 s0, s0, 15
217213
; GFX12-NEXT: v_mov_b32_e32 v0, 0
218214
; GFX12-NEXT: s_mov_b32 s32, 16
219-
; GFX12-NEXT: s_wait_alu 0xfffe
220215
; GFX12-NEXT: s_and_b32 s0, s0, -16
221-
; GFX12-NEXT: s_mov_b32 s1, s32
222216
; GFX12-NEXT: s_wait_alu 0xfffe
217+
; GFX12-NEXT: s_mov_b32 s1, s32
223218
; GFX12-NEXT: s_lshl_b32 s0, s0, 5
224219
; GFX12-NEXT: scratch_store_b32 off, v0, s1
225220
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -232,8 +227,7 @@ define amdgpu_cs_chain void @test_alloca_and_call_var_uniform(i32 inreg %count)
232227
; GFX942-LABEL: test_alloca_and_call_var_uniform:
233228
; GFX942: ; %bb.0:
234229
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
235-
; GFX942-NEXT: s_lshl_b32 s0, s0, 2
236-
; GFX942-NEXT: s_add_i32 s0, s0, 15
230+
; GFX942-NEXT: s_lshl2_add_u32 s0, s0, 15
237231
; GFX942-NEXT: s_and_b32 s0, s0, -16
238232
; GFX942-NEXT: s_lshl_b32 s2, s0, 6
239233
; GFX942-NEXT: s_getpc_b64 s[0:1]
@@ -396,14 +390,12 @@ define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count)
396390
; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12
397391
; GFX12-NEXT: s_wait_alu 0xfffe
398392
; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24
399-
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
393+
; GFX12-NEXT: s_lshl2_add_u32 s0, s0, 15
400394
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
401-
; GFX12-NEXT: s_add_co_i32 s0, s0, 15
402395
; GFX12-NEXT: s_mov_b32 s32, 16
403-
; GFX12-NEXT: s_wait_alu 0xfffe
404396
; GFX12-NEXT: s_and_b32 s0, s0, -16
405-
; GFX12-NEXT: s_mov_b32 s4, s32
406397
; GFX12-NEXT: s_wait_alu 0xfffe
398+
; GFX12-NEXT: s_mov_b32 s4, s32
407399
; GFX12-NEXT: s_lshl_b32 s0, s0, 5
408400
; GFX12-NEXT: v_mov_b32_e32 v40, 0
409401
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -417,8 +409,7 @@ define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count)
417409
; GFX942-LABEL: test_call_and_alloca_var_uniform:
418410
; GFX942: ; %bb.0:
419411
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
420-
; GFX942-NEXT: s_lshl_b32 s0, s0, 2
421-
; GFX942-NEXT: s_add_i32 s0, s0, 15
412+
; GFX942-NEXT: s_lshl2_add_u32 s0, s0, 15
422413
; GFX942-NEXT: s_and_b32 s0, s0, -16
423414
; GFX942-NEXT: s_lshl_b32 s2, s0, 6
424415
; GFX942-NEXT: s_getpc_b64 s[0:1]

llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -265,8 +265,7 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspac
265265
; GFX9-NEXT: v_mov_b32_e32 v0, 7
266266
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
267267
; GFX9-NEXT: s_sub_i32 s2, s2, s3
268-
; GFX9-NEXT: s_lshl_b32 s2, s2, 2
269-
; GFX9-NEXT: s_add_i32 s0, s0, s2
268+
; GFX9-NEXT: s_lshl2_add_u32 s0, s2, s0
270269
; GFX9-NEXT: v_mov_b32_e32 v1, s0
271270
; GFX9-NEXT: v_mov_b32_e32 v2, s1
272271
; GFX9-NEXT: ds_cmpst_rtn_b32 v0, v1, v0, v2 offset:16
@@ -282,9 +281,8 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspac
282281
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
283282
; GFX11-NEXT: s_sub_i32 s2, s2, s3
284283
; GFX11-NEXT: v_mov_b32_e32 v2, s1
285-
; GFX11-NEXT: s_lshl_b32 s2, s2, 2
286-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
287-
; GFX11-NEXT: s_add_i32 s0, s0, s2
284+
; GFX11-NEXT: s_lshl2_add_u32 s0, s2, s0
285+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
288286
; GFX11-NEXT: v_mov_b32_e32 v1, s0
289287
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
290288
; GFX11-NEXT: s_waitcnt lgkmcnt(0)

0 commit comments

Comments
 (0)