Skip to content

Commit b2f1080

Browse files
Ana Mihajlovic
authored and committed
[AMDGPU] Merge V_CNDMASKS into V_DUAL_CNDMASK
1 parent 51b12c3 commit b2f1080

File tree

11 files changed

+260
-177
lines changed

11 files changed

+260
-177
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 88 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,25 @@ class SIFoldOperandsImpl {
105105
}
106106
}
107107

108+
/// Map an unsigned 32-bit VOP3 compare to the compare that yields the
/// logically negated result on the same operands.
///
/// \param MI instruction whose opcode is looked up.
/// \returns the inverse compare opcode (EQ<->NE, GE<->LT, LE<->GT), or 0 when
///          MI's opcode is not one of the six handled V_CMP_*_U32_e64 forms.
unsigned getInverseCompareOpcode(MachineInstr &MI) const {
  // Each row holds a compare and its negation; the relation is symmetric, so
  // a match in either column returns the opcode from the other column.
  static constexpr unsigned InversePairs[][2] = {
      {AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_NE_U32_e64},
      {AMDGPU::V_CMP_GE_U32_e64, AMDGPU::V_CMP_LT_U32_e64},
      {AMDGPU::V_CMP_LE_U32_e64, AMDGPU::V_CMP_GT_U32_e64},
  };

  const unsigned Opc = MI.getOpcode();
  for (const auto &Pair : InversePairs) {
    if (Opc == Pair[0])
      return Pair[1];
    if (Opc == Pair[1])
      return Pair[0];
  }

  // Not a compare this helper knows how to invert.
  return 0;
}
126+
108127
bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
109128
MachineInstr &MI) const;
110129

@@ -133,7 +152,8 @@ class SIFoldOperandsImpl {
133152

134153
std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;
135154
bool tryConstantFoldOp(MachineInstr *MI) const;
136-
bool tryFoldCndMask(MachineInstr &MI) const;
155+
bool tryFoldCndMask(MachineInstr &MI, Register *RegVCC,
156+
Register *newVCC) const;
137157
bool tryFoldZeroHighBits(MachineInstr &MI) const;
138158
bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
139159

@@ -152,6 +172,9 @@ class SIFoldOperandsImpl {
152172

153173
bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
154174

175+
bool shouldSwitchOperands(MachineRegisterInfo &MRI, MachineInstr &MI,
176+
const SIInstrInfo &TII) const;
177+
155178
public:
156179
SIFoldOperandsImpl() = default;
157180

@@ -1459,13 +1482,73 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
14591482
return false;
14601483
}
14611484

1485+
/// Decide whether swapping src0/src1 on the V_CNDMASK users of a condition
/// register looks profitable.
///
/// The register examined is operand 5 of \p MI (presumably the src2 condition
/// mask of V_CNDMASK_B32_e64 -- the visible caller only invokes this for that
/// opcode; confirm against the instruction definition).
///
/// \param MRI register info used to enumerate the non-debug uses.
/// \param MI  instruction whose operand 5 supplies the register to inspect.
/// \param TII instruction info used to look up the named src0/src1 operands.
/// \returns false as soon as any non-debug use is not a V_CNDMASK_B32_e64, or
///          is one whose src0 is an immediate (or materialized immediate)
///          while src1 is not. Otherwise returns true iff at least two uses
///          have an immediate in src1 but not in src0.
bool SIFoldOperandsImpl::shouldSwitchOperands(MachineRegisterInfo &MRI,
                                              MachineInstr &MI,
                                              const SIInstrInfo &TII) const {
  const Register CondReg = MI.getOperand(5).getReg();
  unsigned NumImmOnlyInSrc1 = 0;

  for (MachineOperand &CondUse : MRI.use_nodbg_operands(CondReg)) {
    MachineInstr &UseMI = *CondUse.getParent();
    if (UseMI.getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
      return false;

    MachineOperand *UseSrc0 = TII.getNamedOperand(UseMI, AMDGPU::OpName::src0);
    MachineOperand *UseSrc1 = TII.getNamedOperand(UseMI, AMDGPU::OpName::src1);

    const bool Src0IsImm = getImmOrMaterializedImm(*UseSrc0).has_value();
    const bool Src1IsImm = getImmOrMaterializedImm(*UseSrc1).has_value();

    // Both immediate or both non-immediate: this use neither votes for nor
    // vetoes the swap.
    if (Src0IsImm == Src1IsImm)
      continue;

    // The immediate already sits in src0; swapping would move it to src1.
    if (Src0IsImm)
      return false;

    // Immediate only in src1: this use would benefit from the swap.
    ++NumImmOnlyInSrc1;
  }

  // Require at least two benefiting uses before reporting a swap as worthwhile.
  return NumImmOnlyInSrc1 >= 2;
}
1509+
14621510
// Try to fold an instruction into a simpler one
1463-
bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
1511+
bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI, Register *RegVCC,
1512+
Register *NewVCC) const {
14641513
unsigned Opc = MI.getOpcode();
14651514
if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
14661515
Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
14671516
return false;
14681517

1518+
if (Opc == AMDGPU::V_CNDMASK_B32_e64) {
1519+
const DebugLoc &DL = MI.getDebugLoc();
1520+
auto Reg = MI.getOperand(5).getReg();
1521+
1522+
if (*RegVCC != Reg) {
1523+
MachineInstr *DefMI = MRI->getVRegDef(Reg);
1524+
if (DefMI) {
1525+
unsigned Opcode = getInverseCompareOpcode(*DefMI);
1526+
if (Opcode &&
1527+
SIFoldOperandsImpl::shouldSwitchOperands(*MRI, MI, *TII)) {
1528+
auto cmpDL = DefMI->getDebugLoc();
1529+
*NewVCC = MRI->createVirtualRegister(MRI->getRegClass(Reg));
1530+
*RegVCC = Reg;
1531+
MachineInstrBuilder inverseCompare = BuildMI(
1532+
*DefMI->getParent(), DefMI, cmpDL, TII->get(Opcode), *NewVCC);
1533+
1534+
inverseCompare.add(DefMI->getOperand(1));
1535+
inverseCompare.add(DefMI->getOperand(2));
1536+
}
1537+
}
1538+
}
1539+
if (*RegVCC == Reg) {
1540+
BuildMI(*MI.getParent(), MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64),
1541+
MI.getOperand(0).getReg())
1542+
.add(MI.getOperand(3))
1543+
.add(MI.getOperand(4))
1544+
.add(MI.getOperand(1))
1545+
.add(MI.getOperand(2))
1546+
.addReg(*NewVCC);
1547+
MI.eraseFromParent();
1548+
return true;
1549+
}
1550+
}
1551+
14691552
MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
14701553
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
14711554
if (!Src1->isIdenticalTo(*Src0)) {
@@ -2533,10 +2616,12 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) {
25332616
bool HasNSZ = MFI->hasNoSignedZerosFPMath();
25342617

25352618
bool Changed = false;
2619+
Register Reg = 0;
2620+
Register newVCC = 0;
25362621
for (MachineBasicBlock *MBB : depth_first(&MF)) {
25372622
MachineOperand *CurrentKnownM0Val = nullptr;
25382623
for (auto &MI : make_early_inc_range(*MBB)) {
2539-
Changed |= tryFoldCndMask(MI);
2624+
Changed |= tryFoldCndMask(MI, &Reg, &newVCC);
25402625

25412626
if (tryFoldZeroHighBits(MI)) {
25422627
Changed = true;

llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2835,9 +2835,9 @@ define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) {
28352835
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
28362836
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
28372837
; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
2838-
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
2839-
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
2840-
; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
2838+
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
2839+
; GFX6-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
2840+
; GFX6-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc
28412841
; GFX6-NEXT: s_setpc_b64 s[30:31]
28422842
;
28432843
; GFX8-LABEL: v_uaddsat_i48:
@@ -2944,10 +2944,10 @@ define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
29442944
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
29452945
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
29462946
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2947-
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
2947+
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
29482948
; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
2949-
; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
2950-
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
2949+
; GFX6-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc
2950+
; GFX6-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
29512951
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
29522952
; GFX6-NEXT: ; return to shader part epilog
29532953
;
@@ -3003,10 +3003,10 @@ define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
30033003
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
30043004
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
30053005
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
3006-
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
3006+
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
30073007
; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
3008-
; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
3009-
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
3008+
; GFX6-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc
3009+
; GFX6-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
30103010
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
30113011
; GFX6-NEXT: ; return to shader part epilog
30123012
;

llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2705,9 +2705,9 @@ define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) {
27052705
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
27062706
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
27072707
; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
2708-
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
2709-
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
2710-
; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
2708+
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
2709+
; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
2710+
; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
27112711
; GFX6-NEXT: s_setpc_b64 s[30:31]
27122712
;
27132713
; GFX8-LABEL: v_usubsat_i48:
@@ -2815,9 +2815,9 @@ define amdgpu_ps <2 x float> @usubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
28152815
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
28162816
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
28172817
; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
2818-
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
2819-
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
2820-
; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
2818+
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
2819+
; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
2820+
; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
28212821
; GFX6-NEXT: ; return to shader part epilog
28222822
;
28232823
; GFX8-LABEL: usubsat_i48_sv:
@@ -2873,9 +2873,9 @@ define amdgpu_ps <2 x float> @usubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
28732873
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
28742874
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
28752875
; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
2876-
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
2877-
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
2878-
; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
2876+
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
2877+
; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
2878+
; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
28792879
; GFX6-NEXT: ; return to shader part epilog
28802880
;
28812881
; GFX8-LABEL: usubsat_i48_vs:

llvm/test/CodeGen/AMDGPU/div_i128.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1287,11 +1287,11 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
12871287
; GFX9-G-NEXT: v_xor_b32_e32 v6, 0x7f, v0
12881288
; GFX9-G-NEXT: v_or_b32_e32 v14, v6, v2
12891289
; GFX9-G-NEXT: v_and_b32_e32 v6, 1, v20
1290-
; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
1291-
; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc
1292-
; GFX9-G-NEXT: v_cndmask_b32_e64 v7, v9, 0, vcc
1293-
; GFX9-G-NEXT: v_cndmask_b32_e64 v12, v10, 0, vcc
1294-
; GFX9-G-NEXT: v_cndmask_b32_e64 v13, v11, 0, vcc
1290+
; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
1291+
; GFX9-G-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc
1292+
; GFX9-G-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc
1293+
; GFX9-G-NEXT: v_cndmask_b32_e32 v12, 0, v10, vcc
1294+
; GFX9-G-NEXT: v_cndmask_b32_e32 v13, 0, v11, vcc
12951295
; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
12961296
; GFX9-G-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
12971297
; GFX9-G-NEXT: v_or_b32_e32 v14, v20, v14
@@ -3414,11 +3414,11 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
34143414
; GFX9-G-NEXT: v_xor_b32_e32 v8, 0x7f, v12
34153415
; GFX9-G-NEXT: v_or_b32_e32 v16, v8, v14
34163416
; GFX9-G-NEXT: v_and_b32_e32 v8, 1, v18
3417-
; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
3418-
; GFX9-G-NEXT: v_cndmask_b32_e64 v10, v0, 0, vcc
3419-
; GFX9-G-NEXT: v_cndmask_b32_e64 v11, v1, 0, vcc
3420-
; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc
3421-
; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v3, 0, vcc
3417+
; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
3418+
; GFX9-G-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc
3419+
; GFX9-G-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc
3420+
; GFX9-G-NEXT: v_cndmask_b32_e32 v8, 0, v2, vcc
3421+
; GFX9-G-NEXT: v_cndmask_b32_e32 v9, 0, v3, vcc
34223422
; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
34233423
; GFX9-G-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
34243424
; GFX9-G-NEXT: v_or_b32_e32 v16, v18, v16

llvm/test/CodeGen/AMDGPU/div_v2i128.ll

Lines changed: 40 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -495,13 +495,13 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
495495
; GISEL-NEXT: v_or_b32_e32 v8, v9, v8
496496
; GISEL-NEXT: v_and_b32_e32 v9, 1, v9
497497
; GISEL-NEXT: v_and_b32_e32 v8, 1, v8
498-
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
499-
; GISEL-NEXT: v_cndmask_b32_e64 v22, v18, 0, vcc
498+
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
499+
; GISEL-NEXT: v_cndmask_b32_e32 v22, 0, v18, vcc
500500
; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8
501-
; GISEL-NEXT: v_cndmask_b32_e64 v8, v20, 0, vcc
502-
; GISEL-NEXT: v_cndmask_b32_e64 v9, v21, 0, vcc
501+
; GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v20, vcc
502+
; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v21, vcc
503503
; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
504-
; GISEL-NEXT: v_cndmask_b32_e64 v23, v19, 0, vcc
504+
; GISEL-NEXT: v_cndmask_b32_e32 v23, 0, v19, vcc
505505
; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
506506
; GISEL-NEXT: s_cbranch_execz .LBB0_6
507507
; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
@@ -685,12 +685,12 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
685685
; GISEL-NEXT: v_or_b32_e32 v11, v14, v15
686686
; GISEL-NEXT: v_and_b32_e32 v14, 1, v11
687687
; GISEL-NEXT: v_or_b32_e32 v10, v11, v10
688-
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
689-
; GISEL-NEXT: v_cndmask_b32_e64 v14, v6, 0, vcc
688+
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14
689+
; GISEL-NEXT: v_cndmask_b32_e32 v14, 0, v6, vcc
690690
; GISEL-NEXT: v_and_b32_e32 v16, 1, v10
691-
; GISEL-NEXT: v_cndmask_b32_e64 v15, v7, 0, vcc
692-
; GISEL-NEXT: v_cndmask_b32_e64 v10, v12, 0, vcc
693-
; GISEL-NEXT: v_cndmask_b32_e64 v11, v13, 0, vcc
691+
; GISEL-NEXT: v_cndmask_b32_e32 v15, 0, v7, vcc
692+
; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc
693+
; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc
694694
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
695695
; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
696696
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
@@ -1251,13 +1251,13 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
12511251
; GISEL-NEXT: v_or_b32_e32 v2, v3, v2
12521252
; GISEL-NEXT: v_and_b32_e32 v3, 1, v3
12531253
; GISEL-NEXT: v_and_b32_e32 v2, 1, v2
1254-
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
1255-
; GISEL-NEXT: v_cndmask_b32_e64 v18, v0, 0, vcc
1254+
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
1255+
; GISEL-NEXT: v_cndmask_b32_e32 v18, 0, v0, vcc
12561256
; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v2
1257-
; GISEL-NEXT: v_cndmask_b32_e64 v2, v16, 0, vcc
1258-
; GISEL-NEXT: v_cndmask_b32_e64 v3, v17, 0, vcc
1257+
; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc
1258+
; GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc
12591259
; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
1260-
; GISEL-NEXT: v_cndmask_b32_e64 v19, v1, 0, vcc
1260+
; GISEL-NEXT: v_cndmask_b32_e32 v19, 0, v1, vcc
12611261
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
12621262
; GISEL-NEXT: s_cbranch_execz .LBB1_6
12631263
; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
@@ -1423,12 +1423,12 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
14231423
; GISEL-NEXT: v_or_b32_e32 v9, v20, v10
14241424
; GISEL-NEXT: v_and_b32_e32 v10, 1, v9
14251425
; GISEL-NEXT: v_or_b32_e32 v8, v9, v8
1426-
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
1427-
; GISEL-NEXT: v_cndmask_b32_e64 v10, v4, 0, vcc
1426+
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10
1427+
; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc
14281428
; GISEL-NEXT: v_and_b32_e32 v20, 1, v8
1429-
; GISEL-NEXT: v_cndmask_b32_e64 v11, v5, 0, vcc
1430-
; GISEL-NEXT: v_cndmask_b32_e64 v8, v6, 0, vcc
1431-
; GISEL-NEXT: v_cndmask_b32_e64 v9, v7, 0, vcc
1429+
; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc
1430+
; GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v6, vcc
1431+
; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v7, vcc
14321432
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
14331433
; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
14341434
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
@@ -2093,13 +2093,13 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
20932093
; GISEL-NEXT: v_or_b32_e32 v18, v19, v18
20942094
; GISEL-NEXT: v_and_b32_e32 v19, 1, v19
20952095
; GISEL-NEXT: v_and_b32_e32 v18, 1, v18
2096-
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
2097-
; GISEL-NEXT: v_cndmask_b32_e64 v31, v16, 0, vcc
2096+
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
2097+
; GISEL-NEXT: v_cndmask_b32_e32 v31, 0, v16, vcc
20982098
; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v18
2099-
; GISEL-NEXT: v_cndmask_b32_e64 v18, v8, 0, vcc
2100-
; GISEL-NEXT: v_cndmask_b32_e64 v19, v9, 0, vcc
2099+
; GISEL-NEXT: v_cndmask_b32_e32 v18, 0, v8, vcc
2100+
; GISEL-NEXT: v_cndmask_b32_e32 v19, 0, v9, vcc
21012101
; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
2102-
; GISEL-NEXT: v_cndmask_b32_e64 v32, v17, 0, vcc
2102+
; GISEL-NEXT: v_cndmask_b32_e32 v32, 0, v17, vcc
21032103
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
21042104
; GISEL-NEXT: s_cbranch_execz .LBB2_6
21052105
; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
@@ -2283,12 +2283,12 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
22832283
; GISEL-NEXT: v_or_b32_e32 v3, v20, v21
22842284
; GISEL-NEXT: v_and_b32_e32 v20, 1, v3
22852285
; GISEL-NEXT: v_or_b32_e32 v2, v3, v2
2286-
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
2287-
; GISEL-NEXT: v_cndmask_b32_e64 v20, v12, 0, vcc
2286+
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20
2287+
; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v12, vcc
22882288
; GISEL-NEXT: v_and_b32_e32 v22, 1, v2
2289-
; GISEL-NEXT: v_cndmask_b32_e64 v21, v13, 0, vcc
2290-
; GISEL-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc
2291-
; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc
2289+
; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v13, vcc
2290+
; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
2291+
; GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v7, vcc
22922292
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22
22932293
; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
22942294
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
@@ -2920,13 +2920,13 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
29202920
; GISEL-NEXT: v_or_b32_e32 v20, v21, v20
29212921
; GISEL-NEXT: v_and_b32_e32 v21, 1, v21
29222922
; GISEL-NEXT: v_and_b32_e32 v20, 1, v20
2923-
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21
2924-
; GISEL-NEXT: v_cndmask_b32_e64 v32, v0, 0, vcc
2923+
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21
2924+
; GISEL-NEXT: v_cndmask_b32_e32 v32, 0, v0, vcc
29252925
; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v20
2926-
; GISEL-NEXT: v_cndmask_b32_e64 v20, v2, 0, vcc
2927-
; GISEL-NEXT: v_cndmask_b32_e64 v21, v3, 0, vcc
2926+
; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v2, vcc
2927+
; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v3, vcc
29282928
; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
2929-
; GISEL-NEXT: v_cndmask_b32_e64 v33, v1, 0, vcc
2929+
; GISEL-NEXT: v_cndmask_b32_e32 v33, 0, v1, vcc
29302930
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
29312931
; GISEL-NEXT: s_cbranch_execz .LBB3_6
29322932
; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
@@ -3092,12 +3092,12 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
30923092
; GISEL-NEXT: v_or_b32_e32 v19, v26, v24
30933093
; GISEL-NEXT: v_and_b32_e32 v24, 1, v19
30943094
; GISEL-NEXT: v_or_b32_e32 v18, v19, v18
3095-
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24
3096-
; GISEL-NEXT: v_cndmask_b32_e64 v24, v4, 0, vcc
3095+
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24
3096+
; GISEL-NEXT: v_cndmask_b32_e32 v24, 0, v4, vcc
30973097
; GISEL-NEXT: v_and_b32_e32 v26, 1, v18
3098-
; GISEL-NEXT: v_cndmask_b32_e64 v25, v5, 0, vcc
3099-
; GISEL-NEXT: v_cndmask_b32_e64 v18, v6, 0, vcc
3100-
; GISEL-NEXT: v_cndmask_b32_e64 v19, v7, 0, vcc
3098+
; GISEL-NEXT: v_cndmask_b32_e32 v25, 0, v5, vcc
3099+
; GISEL-NEXT: v_cndmask_b32_e32 v18, 0, v6, vcc
3100+
; GISEL-NEXT: v_cndmask_b32_e32 v19, 0, v7, vcc
31013101
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
31023102
; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
31033103
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]

0 commit comments

Comments
 (0)