Skip to content

Commit 81bbe98

Browse files
authored
Revert "[AArch64][Machine-Combiner] Split gather patterns into neon regs to multiple vectors (#142941)" (#150505)
Reverting due to reported miscompiles; will reland once it is fixed.
1 parent 75346e3 commit 81bbe98

File tree

10 files changed

+346
-996
lines changed

10 files changed

+346
-996
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 0 additions & 265 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
#include "Utils/AArch64BaseInfo.h"
2121
#include "llvm/ADT/ArrayRef.h"
2222
#include "llvm/ADT/STLExtras.h"
23-
#include "llvm/ADT/SmallSet.h"
2423
#include "llvm/ADT/SmallVector.h"
2524
#include "llvm/CodeGen/CFIInstBuilder.h"
2625
#include "llvm/CodeGen/LivePhysRegs.h"
@@ -36,7 +35,6 @@
3635
#include "llvm/CodeGen/MachineRegisterInfo.h"
3736
#include "llvm/CodeGen/RegisterScavenging.h"
3837
#include "llvm/CodeGen/StackMaps.h"
39-
#include "llvm/CodeGen/TargetOpcodes.h"
4038
#include "llvm/CodeGen/TargetRegisterInfo.h"
4139
#include "llvm/CodeGen/TargetSubtargetInfo.h"
4240
#include "llvm/IR/DebugInfoMetadata.h"
@@ -7354,9 +7352,6 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
73547352
case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
73557353
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
73567354
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
7357-
case AArch64MachineCombinerPattern::GATHER_LANE_i32:
7358-
case AArch64MachineCombinerPattern::GATHER_LANE_i16:
7359-
case AArch64MachineCombinerPattern::GATHER_LANE_i8:
73607355
return true;
73617356
} // end switch (Pattern)
73627357
return false;
@@ -7397,252 +7392,11 @@ static bool getMiscPatterns(MachineInstr &Root,
73977392
return false;
73987393
}
73997394

7400-
static bool getGatherPattern(MachineInstr &Root,
7401-
SmallVectorImpl<unsigned> &Patterns,
7402-
unsigned LoadLaneOpCode, unsigned NumLanes) {
7403-
const MachineFunction *MF = Root.getMF();
7404-
7405-
// Early exit if optimizing for size.
7406-
if (MF->getFunction().hasMinSize())
7407-
return false;
7408-
7409-
const MachineRegisterInfo &MRI = MF->getRegInfo();
7410-
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
7411-
7412-
// The root of the pattern must load into the last lane of the vector.
7413-
if (Root.getOperand(2).getImm() != NumLanes - 1)
7414-
return false;
7415-
7416-
// Check that we have load into all lanes except lane 0.
7417-
// For each load we also want to check that:
7418-
// 1. It has a single non-debug use (since we will be replacing the virtual
7419-
// register)
7420-
// 2. That the addressing mode only uses a single offset register.
7421-
auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
7422-
auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
7423-
SmallSet<unsigned, 4> RemainingLanes(Range.begin(), Range.end());
7424-
while (!RemainingLanes.empty() && CurrInstr &&
7425-
CurrInstr->getOpcode() == LoadLaneOpCode &&
7426-
MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
7427-
CurrInstr->getNumOperands() == 4) {
7428-
RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
7429-
CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
7430-
}
7431-
7432-
if (!RemainingLanes.empty())
7433-
return false;
7434-
7435-
// Match the SUBREG_TO_REG sequence.
7436-
if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
7437-
return false;
7438-
7439-
// Verify that the subreg to reg loads an integer into the first lane.
7440-
auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
7441-
unsigned SingleLaneSizeInBits = 128 / NumLanes;
7442-
if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
7443-
return false;
7444-
7445-
// Verify that it also has a single non debug use.
7446-
if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
7447-
return false;
7448-
7449-
switch (NumLanes) {
7450-
case 4:
7451-
Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32);
7452-
break;
7453-
case 8:
7454-
Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16);
7455-
break;
7456-
case 16:
7457-
Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8);
7458-
break;
7459-
default:
7460-
llvm_unreachable("Got bad number of lanes for gather pattern.");
7461-
}
7462-
7463-
return true;
7464-
}
7465-
7466-
/// Search for patterns where we use LD1 instructions to load into
7467-
/// separate lanes of an 128 bit Neon register. We can increase Memory Level
7468-
/// Parallelism by loading into 2 Neon registers instead.
7469-
static bool getLoadPatterns(MachineInstr &Root,
7470-
SmallVectorImpl<unsigned> &Patterns) {
7471-
7472-
// The pattern searches for loads into single lanes.
7473-
switch (Root.getOpcode()) {
7474-
case AArch64::LD1i32:
7475-
return getGatherPattern(Root, Patterns, Root.getOpcode(), 4);
7476-
case AArch64::LD1i16:
7477-
return getGatherPattern(Root, Patterns, Root.getOpcode(), 8);
7478-
case AArch64::LD1i8:
7479-
return getGatherPattern(Root, Patterns, Root.getOpcode(), 16);
7480-
default:
7481-
return false;
7482-
}
7483-
}
7484-
7485-
static void
7486-
generateGatherPattern(MachineInstr &Root,
7487-
SmallVectorImpl<MachineInstr *> &InsInstrs,
7488-
SmallVectorImpl<MachineInstr *> &DelInstrs,
7489-
DenseMap<Register, unsigned> &InstrIdxForVirtReg,
7490-
unsigned Pattern, unsigned NumLanes) {
7491-
7492-
MachineFunction &MF = *Root.getParent()->getParent();
7493-
MachineRegisterInfo &MRI = MF.getRegInfo();
7494-
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
7495-
7496-
// Gather the initial load instructions to build the pattern
7497-
SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
7498-
MachineInstr *CurrInstr = &Root;
7499-
for (unsigned i = 0; i < NumLanes - 1; ++i) {
7500-
LoadToLaneInstrs.push_back(CurrInstr);
7501-
CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
7502-
}
7503-
7504-
// Sort the load instructions according to the lane.
7505-
llvm::sort(LoadToLaneInstrs,
7506-
[](const MachineInstr *A, const MachineInstr *B) {
7507-
return A->getOperand(2).getImm() > B->getOperand(2).getImm();
7508-
});
7509-
7510-
MachineInstr *SubregToReg = CurrInstr;
7511-
LoadToLaneInstrs.push_back(
7512-
MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg()));
7513-
auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);
7514-
7515-
const TargetRegisterClass *FPR128RegClass =
7516-
MRI.getRegClass(Root.getOperand(0).getReg());
7517-
7518-
auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
7519-
Register SrcRegister, unsigned Lane,
7520-
Register OffsetRegister) {
7521-
auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
7522-
MachineInstrBuilder LoadIndexIntoRegister =
7523-
BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
7524-
NewRegister)
7525-
.addReg(SrcRegister)
7526-
.addImm(Lane)
7527-
.addReg(OffsetRegister, getKillRegState(true));
7528-
InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
7529-
InsInstrs.push_back(LoadIndexIntoRegister);
7530-
return NewRegister;
7531-
};
7532-
7533-
// Helper to create load instruction based on opcode
7534-
auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg,
7535-
Register OffsetReg) -> MachineInstrBuilder {
7536-
unsigned Opcode;
7537-
switch (NumLanes) {
7538-
case 4:
7539-
Opcode = AArch64::LDRSui;
7540-
break;
7541-
case 8:
7542-
Opcode = AArch64::LDRHui;
7543-
break;
7544-
case 16:
7545-
Opcode = AArch64::LDRBui;
7546-
break;
7547-
default:
7548-
llvm_unreachable(
7549-
"Got unsupported number of lanes in machine-combiner gather pattern");
7550-
}
7551-
// Immediate offset load
7552-
return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
7553-
.addReg(OffsetReg)
7554-
.addImm(0); // immediate offset
7555-
};
7556-
7557-
// Load the remaining lanes into register 0.
7558-
auto LanesToLoadToReg0 =
7559-
llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
7560-
LoadToLaneInstrsAscending.begin() + NumLanes / 2);
7561-
auto PrevReg = SubregToReg->getOperand(0).getReg();
7562-
for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
7563-
PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
7564-
LoadInstr->getOperand(3).getReg());
7565-
DelInstrs.push_back(LoadInstr);
7566-
}
7567-
auto LastLoadReg0 = PrevReg;
7568-
7569-
// First load into register 1. Perform a LDRSui to zero out the upper lanes in
7570-
// a single instruction.
7571-
auto Lane0Load = *LoadToLaneInstrsAscending.begin();
7572-
auto OriginalSplitLoad =
7573-
*std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
7574-
auto DestRegForMiddleIndex = MRI.createVirtualRegister(
7575-
MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
7576-
7577-
MachineInstrBuilder MiddleIndexLoadInstr =
7578-
CreateLoadInstruction(NumLanes, DestRegForMiddleIndex,
7579-
OriginalSplitLoad->getOperand(3).getReg());
7580-
7581-
InstrIdxForVirtReg.insert(
7582-
std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
7583-
InsInstrs.push_back(MiddleIndexLoadInstr);
7584-
DelInstrs.push_back(OriginalSplitLoad);
7585-
7586-
// Subreg To Reg instruction for register 1.
7587-
auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
7588-
unsigned SubregType;
7589-
switch (NumLanes) {
7590-
case 4:
7591-
SubregType = AArch64::ssub;
7592-
break;
7593-
case 8:
7594-
SubregType = AArch64::hsub;
7595-
break;
7596-
case 16:
7597-
SubregType = AArch64::bsub;
7598-
break;
7599-
default:
7600-
llvm_unreachable(
7601-
"Got invalid NumLanes for machine-combiner gather pattern");
7602-
}
7603-
7604-
auto SubRegToRegInstr =
7605-
BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
7606-
DestRegForSubregToReg)
7607-
.addImm(0)
7608-
.addReg(DestRegForMiddleIndex, getKillRegState(true))
7609-
.addImm(SubregType);
7610-
InstrIdxForVirtReg.insert(
7611-
std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
7612-
InsInstrs.push_back(SubRegToRegInstr);
7613-
7614-
// Load remaining lanes into register 1.
7615-
auto LanesToLoadToReg1 =
7616-
llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
7617-
LoadToLaneInstrsAscending.end());
7618-
PrevReg = SubRegToRegInstr->getOperand(0).getReg();
7619-
for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
7620-
PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
7621-
LoadInstr->getOperand(3).getReg());
7622-
if (Index == NumLanes / 2 - 2) {
7623-
break;
7624-
}
7625-
DelInstrs.push_back(LoadInstr);
7626-
}
7627-
auto LastLoadReg1 = PrevReg;
7628-
7629-
// Create the final zip instruction to combine the results.
7630-
MachineInstrBuilder ZipInstr =
7631-
BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
7632-
Root.getOperand(0).getReg())
7633-
.addReg(LastLoadReg0)
7634-
.addReg(LastLoadReg1);
7635-
InsInstrs.push_back(ZipInstr);
7636-
}
7637-
76387395
CombinerObjective
76397396
AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
76407397
switch (Pattern) {
76417398
case AArch64MachineCombinerPattern::SUBADD_OP1:
76427399
case AArch64MachineCombinerPattern::SUBADD_OP2:
7643-
case AArch64MachineCombinerPattern::GATHER_LANE_i32:
7644-
case AArch64MachineCombinerPattern::GATHER_LANE_i16:
7645-
case AArch64MachineCombinerPattern::GATHER_LANE_i8:
76467400
return CombinerObjective::MustReduceDepth;
76477401
default:
76487402
return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -7672,10 +7426,6 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
76727426
if (getMiscPatterns(Root, Patterns))
76737427
return true;
76747428

7675-
// Load patterns
7676-
if (getLoadPatterns(Root, Patterns))
7677-
return true;
7678-
76797429
return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
76807430
DoRegPressureReduce);
76817431
}
@@ -8931,21 +8681,6 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
89318681
MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
89328682
break;
89338683
}
8934-
case AArch64MachineCombinerPattern::GATHER_LANE_i32: {
8935-
generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
8936-
Pattern, 4);
8937-
break;
8938-
}
8939-
case AArch64MachineCombinerPattern::GATHER_LANE_i16: {
8940-
generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
8941-
Pattern, 8);
8942-
break;
8943-
}
8944-
case AArch64MachineCombinerPattern::GATHER_LANE_i8: {
8945-
generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
8946-
Pattern, 16);
8947-
break;
8948-
}
89498684

89508685
} // end switch (Pattern)
89518686
// Record MUL and ADD/SUB for deletion

llvm/lib/Target/AArch64/AArch64InstrInfo.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -172,10 +172,6 @@ enum AArch64MachineCombinerPattern : unsigned {
172172
FMULv8i16_indexed_OP2,
173173

174174
FNMADD,
175-
176-
GATHER_LANE_i32,
177-
GATHER_LANE_i16,
178-
GATHER_LANE_i8
179175
};
180176
class AArch64InstrInfo final : public AArch64GenInstrInfo {
181177
const AArch64RegisterInfo RI;

0 commit comments

Comments (0)