Skip to content

Commit 48f36c6

Browse files
authored
[LLVM] Make use of s_flbit_i32_b64 and s_ff1_i32_b64 (#75158)
Update DAG ISel to support the 64-bit versions, S_FF1_I32_B64 and S_FLBIT_I32_B64. --------- Co-authored-by: Acim Maravic <[email protected]>
1 parent 0fbc728 commit 48f36c6

22 files changed

+1567
-934
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3070,18 +3070,26 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons
30703070

30713071
bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
30723072
Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3073+
bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
30733074

3074-
if (Src.getValueType() == MVT::i32) {
3075+
if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
30753076
// (ctlz hi:lo) -> (umin (ffbh src), 32)
30763077
// (cttz hi:lo) -> (umin (ffbl src), 32)
30773078
// (ctlz_zero_undef src) -> (ffbh src)
30783079
// (cttz_zero_undef src) -> (ffbl src)
3080+
3081+
// The 64-bit scalar versions produce a 32-bit result:
3082+
// (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3083+
// (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3084+
// (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3085+
// (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
30793086
SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
30803087
if (!ZeroUndef) {
3081-
const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3082-
NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
3088+
const SDValue ConstVal = DAG.getConstant(
3089+
Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3090+
NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
30833091
}
3084-
return NewOpr;
3092+
return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
30853093
}
30863094

30873095
SDValue Lo, Hi;

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6912,6 +6912,15 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
69126912
Inst.eraseFromParent();
69136913
return;
69146914

6915+
case AMDGPU::S_FLBIT_I32_B64:
6916+
splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
6917+
Inst.eraseFromParent();
6918+
return;
6919+
case AMDGPU::S_FF1_I32_B64:
6920+
splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
6921+
Inst.eraseFromParent();
6922+
return;
6923+
69156924
case AMDGPU::S_LSHL_B32:
69166925
if (ST.hasOnlyRevVALUShifts()) {
69176926
NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
@@ -7845,6 +7854,61 @@ void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
78457854
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
78467855
}
78477856

7857+
// Expands a 64-bit scalar count instruction into 32-bit VALU instructions.
// Opcode is the 32-bit VALU count opcode applied to each half of the source;
// the callers visible here pass V_FFBH_U32_e32 (for S_FLBIT_I32_B64) or
// V_FFBL_B32_e32 (for S_FF1_I32_B64). Inst is not erased by this function.
void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
7858+
MachineInstr &Inst, unsigned Opcode,
7859+
MachineDominatorTree *MDT) const {
7860+
// NOTE(review): MDT is not referenced anywhere in this body.
// (S_FLBIT_I32_B64 hi:lo) ->
7861+
// -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
7862+
// (S_FF1_I32_B64 hi:lo) ->
7863+
// -> (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
7864+
7865+
MachineBasicBlock &MBB = *Inst.getParent();
7866+
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7867+
MachineBasicBlock::iterator MII = Inst; // new instructions are inserted before Inst
7868+
const DebugLoc &DL = Inst.getDebugLoc();
7869+
7870+
MachineOperand &Dest = Inst.getOperand(0); // result operand of the scalar count op
7871+
MachineOperand &Src = Inst.getOperand(1); // 64-bit value being scanned
7872+
7873+
const MCInstrDesc &InstDesc = get(Opcode); // descriptor of the per-half count op
7874+
7875+
bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32; // any other opcode takes the cttz path
7876+
unsigned OpcodeAdd =
7877+
ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
7878+
7879+
// Non-register sources fall back to the SGPR_32 register class.
const TargetRegisterClass *SrcRC =
7880+
Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
7881+
const TargetRegisterClass *SrcSubRC =
7882+
RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
7883+
7884+
// Split the source into its sub0 ("lo" in the comment above) and sub1 ("hi")
// halves.
MachineOperand SrcRegSub0 =
7885+
buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
7886+
MachineOperand SrcRegSub1 =
7887+
buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
7888+
7889+
Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); // count of sub0 half
7890+
Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); // count of sub1 half
7891+
Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); // one count plus 32
7892+
Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); // final result
7893+
7894+
BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
7895+
7896+
BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
7897+
7898+
// Offset the count of the half scanned second (sub0 for ctlz, sub1 for cttz)
// by 32. Presumably the saturating add keeps a "no bit found" all-ones count
// from wrapping around - TODO confirm against the ISA description.
// NOTE(review): the clamp immediate is appended for both add opcodes; confirm
// that the V_ADD_CO_U32_e32 form accepts it.
BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
7899+
.addReg(IsCtlz ? MidReg1 : MidReg2)
7900+
.addImm(32)
7901+
.addImm(1); // enable clamp so the add behaves as the uaddsat shown above
7902+
7903+
// The result is the unsigned minimum of the offset count and the other
// half's count.
BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
7904+
.addReg(MidReg3)
7905+
.addReg(IsCtlz ? MidReg2 : MidReg1);
7906+
7907+
// Redirect every use of the original result to the new VGPR result.
MRI.replaceRegWith(Dest.getReg(), MidReg4);
7908+
7909+
// Queue the users of the new result on the move-to-VALU worklist.
addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
7910+
}
7911+
78487912
void SIInstrInfo::addUsersToMoveToVALUWorklist(
78497913
Register DstReg, MachineRegisterInfo &MRI,
78507914
SIInstrWorklist &Worklist) const {

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
144144
void splitScalar64BitBCNT(SIInstrWorklist &Worklist,
145145
MachineInstr &Inst) const;
146146
void splitScalar64BitBFE(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
147+
// Expands a 64-bit scalar count instruction (Inst) into VALU instructions that
// apply the 32-bit count opcode given in Opcode to each half of the source.
// MDT is optional and defaults to nullptr.
void splitScalar64BitCountOp(SIInstrWorklist &Worklist, MachineInstr &Inst,
148+
unsigned Opcode,
149+
MachineDominatorTree *MDT = nullptr) const;
147150
void movePackToVALU(SIInstrWorklist &Worklist, MachineRegisterInfo &MRI,
148151
MachineInstr &Inst) const;
149152

llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll

Lines changed: 27 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -589,13 +589,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
589589
; GFX8-NEXT: ; implicit-def: $vgpr1
590590
; GFX8-NEXT: .LBB2_1: ; %ComputeLoop
591591
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
592-
; GFX8-NEXT: s_ff1_i32_b32 s5, s3
593-
; GFX8-NEXT: s_ff1_i32_b32 s6, s2
594-
; GFX8-NEXT: s_add_i32 s5, s5, 32
595-
; GFX8-NEXT: s_min_u32 s5, s6, s5
592+
; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
593+
; GFX8-NEXT: s_mov_b32 m0, s5
596594
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
597595
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
598-
; GFX8-NEXT: s_mov_b32 m0, s5
599596
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
600597
; GFX8-NEXT: s_add_i32 s4, s4, s8
601598
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -633,13 +630,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
633630
; GFX9-NEXT: ; implicit-def: $vgpr1
634631
; GFX9-NEXT: .LBB2_1: ; %ComputeLoop
635632
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
636-
; GFX9-NEXT: s_ff1_i32_b32 s5, s3
637-
; GFX9-NEXT: s_ff1_i32_b32 s6, s2
638-
; GFX9-NEXT: s_add_i32 s5, s5, 32
639-
; GFX9-NEXT: s_min_u32 s5, s6, s5
633+
; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
634+
; GFX9-NEXT: s_mov_b32 m0, s5
640635
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
641636
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
642-
; GFX9-NEXT: s_mov_b32 m0, s5
643637
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
644638
; GFX9-NEXT: s_add_i32 s4, s4, s8
645639
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -676,10 +670,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
676670
; GFX10W64-NEXT: ; implicit-def: $vgpr1
677671
; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop
678672
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
679-
; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3
680-
; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2
681-
; GFX10W64-NEXT: s_add_i32 s5, s5, 32
682-
; GFX10W64-NEXT: s_min_u32 s5, s6, s5
673+
; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3]
683674
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5
684675
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5
685676
; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5
@@ -758,16 +749,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
758749
; GFX11W64-NEXT: ; implicit-def: $vgpr1
759750
; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop
760751
; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1
761-
; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3
762-
; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2
763-
; GFX11W64-NEXT: s_add_i32 s5, s5, 32
764-
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
765-
; GFX11W64-NEXT: s_min_u32 s5, s6, s5
752+
; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
753+
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
766754
; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5
767755
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5
768756
; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5
769757
; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
770-
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
771758
; GFX11W64-NEXT: s_add_i32 s4, s4, s8
772759
; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0
773760
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
@@ -849,16 +836,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
849836
; GFX12W64-NEXT: ; implicit-def: $vgpr1
850837
; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop
851838
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
852-
; GFX12W64-NEXT: s_ctz_i32_b32 s5, s3
853-
; GFX12W64-NEXT: s_ctz_i32_b32 s6, s2
854-
; GFX12W64-NEXT: s_add_co_i32 s5, s5, 32
855-
; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
856-
; GFX12W64-NEXT: s_min_u32 s5, s6, s5
839+
; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
840+
; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
857841
; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5
858842
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
859843
; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5
860844
; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
861-
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
862845
; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8
863846
; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0
864847
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
@@ -961,13 +944,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
961944
; GFX8-NEXT: ; implicit-def: $vgpr1
962945
; GFX8-NEXT: .LBB3_1: ; %ComputeLoop
963946
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
964-
; GFX8-NEXT: s_ff1_i32_b32 s5, s3
965-
; GFX8-NEXT: s_ff1_i32_b32 s6, s2
966-
; GFX8-NEXT: s_add_i32 s5, s5, 32
967-
; GFX8-NEXT: s_min_u32 s5, s6, s5
947+
; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
948+
; GFX8-NEXT: s_mov_b32 m0, s5
968949
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
969950
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
970-
; GFX8-NEXT: s_mov_b32 m0, s5
971951
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
972952
; GFX8-NEXT: s_add_i32 s4, s4, s8
973953
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -1007,13 +987,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
1007987
; GFX9-NEXT: ; implicit-def: $vgpr1
1008988
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
1009989
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
1010-
; GFX9-NEXT: s_ff1_i32_b32 s5, s3
1011-
; GFX9-NEXT: s_ff1_i32_b32 s6, s2
1012-
; GFX9-NEXT: s_add_i32 s5, s5, 32
1013-
; GFX9-NEXT: s_min_u32 s5, s6, s5
990+
; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
991+
; GFX9-NEXT: s_mov_b32 m0, s5
1014992
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
1015993
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
1016-
; GFX9-NEXT: s_mov_b32 m0, s5
1017994
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
1018995
; GFX9-NEXT: s_add_i32 s4, s4, s8
1019996
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -1052,10 +1029,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
10521029
; GFX10W64-NEXT: ; implicit-def: $vgpr1
10531030
; GFX10W64-NEXT: .LBB3_1: ; %ComputeLoop
10541031
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
1055-
; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3
1056-
; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2
1057-
; GFX10W64-NEXT: s_add_i32 s5, s5, 32
1058-
; GFX10W64-NEXT: s_min_u32 s5, s6, s5
1032+
; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3]
10591033
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5
10601034
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5
10611035
; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5
@@ -1140,16 +1114,12 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
11401114
; GFX11W64-NEXT: ; implicit-def: $vgpr1
11411115
; GFX11W64-NEXT: .LBB3_1: ; %ComputeLoop
11421116
; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1
1143-
; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3
1144-
; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2
1145-
; GFX11W64-NEXT: s_add_i32 s5, s5, 32
1146-
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1147-
; GFX11W64-NEXT: s_min_u32 s5, s6, s5
1117+
; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
1118+
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
11481119
; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5
11491120
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5
11501121
; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5
11511122
; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
1152-
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
11531123
; GFX11W64-NEXT: s_add_i32 s4, s4, s8
11541124
; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0
11551125
; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1237,16 +1207,12 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
12371207
; GFX12W64-NEXT: ; implicit-def: $vgpr1
12381208
; GFX12W64-NEXT: .LBB3_1: ; %ComputeLoop
12391209
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
1240-
; GFX12W64-NEXT: s_ctz_i32_b32 s5, s3
1241-
; GFX12W64-NEXT: s_ctz_i32_b32 s6, s2
1242-
; GFX12W64-NEXT: s_add_co_i32 s5, s5, 32
1243-
; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1244-
; GFX12W64-NEXT: s_min_u32 s5, s6, s5
1210+
; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
1211+
; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
12451212
; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5
12461213
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
12471214
; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5
12481215
; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
1249-
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
12501216
; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8
12511217
; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0
12521218
; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1
@@ -2005,13 +1971,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
20051971
; GFX8-NEXT: ; implicit-def: $vgpr1
20061972
; GFX8-NEXT: .LBB7_1: ; %ComputeLoop
20071973
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
2008-
; GFX8-NEXT: s_ff1_i32_b32 s5, s3
2009-
; GFX8-NEXT: s_ff1_i32_b32 s6, s2
2010-
; GFX8-NEXT: s_add_i32 s5, s5, 32
2011-
; GFX8-NEXT: s_min_u32 s5, s6, s5
1974+
; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
1975+
; GFX8-NEXT: s_mov_b32 m0, s5
20121976
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
20131977
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
2014-
; GFX8-NEXT: s_mov_b32 m0, s5
20151978
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
20161979
; GFX8-NEXT: s_add_i32 s4, s4, s8
20171980
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -2049,13 +2012,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
20492012
; GFX9-NEXT: ; implicit-def: $vgpr1
20502013
; GFX9-NEXT: .LBB7_1: ; %ComputeLoop
20512014
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2052-
; GFX9-NEXT: s_ff1_i32_b32 s5, s3
2053-
; GFX9-NEXT: s_ff1_i32_b32 s6, s2
2054-
; GFX9-NEXT: s_add_i32 s5, s5, 32
2055-
; GFX9-NEXT: s_min_u32 s5, s6, s5
2015+
; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
2016+
; GFX9-NEXT: s_mov_b32 m0, s5
20562017
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
20572018
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
2058-
; GFX9-NEXT: s_mov_b32 m0, s5
20592019
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
20602020
; GFX9-NEXT: s_add_i32 s4, s4, s8
20612021
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -2092,10 +2052,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
20922052
; GFX10W64-NEXT: ; implicit-def: $vgpr1
20932053
; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop
20942054
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
2095-
; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3
2096-
; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2
2097-
; GFX10W64-NEXT: s_add_i32 s5, s5, 32
2098-
; GFX10W64-NEXT: s_min_u32 s5, s6, s5
2055+
; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3]
20992056
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5
21002057
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5
21012058
; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5
@@ -2174,16 +2131,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
21742131
; GFX11W64-NEXT: ; implicit-def: $vgpr1
21752132
; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop
21762133
; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1
2177-
; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3
2178-
; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2
2179-
; GFX11W64-NEXT: s_add_i32 s5, s5, 32
2180-
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
2181-
; GFX11W64-NEXT: s_min_u32 s5, s6, s5
2134+
; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
2135+
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
21822136
; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5
21832137
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5
21842138
; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5
21852139
; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
2186-
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
21872140
; GFX11W64-NEXT: s_add_i32 s4, s4, s8
21882141
; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0
21892142
; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1
@@ -2266,16 +2219,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
22662219
; GFX12W64-NEXT: ; implicit-def: $vgpr1
22672220
; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop
22682221
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
2269-
; GFX12W64-NEXT: s_ctz_i32_b32 s5, s3
2270-
; GFX12W64-NEXT: s_ctz_i32_b32 s6, s2
2271-
; GFX12W64-NEXT: s_add_co_i32 s5, s5, 32
2272-
; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
2273-
; GFX12W64-NEXT: s_min_u32 s5, s6, s5
2222+
; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
2223+
; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
22742224
; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5
22752225
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
22762226
; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5
22772227
; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
2278-
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
22792228
; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8
22802229
; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0
22812230
; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1

0 commit comments

Comments
 (0)