Skip to content

Commit 6dcb509

Browse files
hbrodinwizardengineer
authored and committed
[CT] Generate more efficient code when fp is from memory
The typical sequence was:
1. Load fp from memory -> fp register
2. Store fp -> stack
3. Load int from stack

The new sequence is instead:
1. Load int directly from the original memory location

This improves codegen for:
- Global floating point variables
- Global floating point constants
- Stack based floating point variables
1 parent def8b7b commit 6dcb509

File tree

4 files changed

+622
-267
lines changed

4 files changed

+622
-267
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 292 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -38057,6 +38057,121 @@ emitCTSelectI386WithConditionMaterialization(MachineInstr &MI,
3805738057
return BB;
3805838058
}
3805938059

38060+
// Helper structure to hold memory operand information for FP loads
38061+
struct FPLoadMemOperands {
38062+
bool IsValid = false;
38063+
unsigned BaseReg = 0;
38064+
int64_t ScaleVal = 1;
38065+
unsigned IndexReg = 0;
38066+
int64_t Disp = 0;
38067+
unsigned SegReg = 0;
38068+
int FrameIndex = -1;
38069+
bool IsFrameIndex = false;
38070+
int ConstantPoolIndex = -1;
38071+
bool IsConstantPool = false;
38072+
const GlobalValue *Global = nullptr;
38073+
int64_t GlobalOffset = 0;
38074+
bool IsGlobal = false;
38075+
};
38076+
38077+
// Check if a virtual register is defined by a simple FP load instruction
38078+
// Returns the memory operands if it's a simple load, otherwise returns invalid
38079+
static FPLoadMemOperands getFPLoadMemOperands(Register Reg,
38080+
MachineRegisterInfo &MRI,
38081+
unsigned ExpectedLoadOpcode) {
38082+
FPLoadMemOperands Result;
38083+
38084+
if (!Reg.isVirtual())
38085+
return Result;
38086+
38087+
MachineInstr *DefMI = MRI.getVRegDef(Reg);
38088+
if (!DefMI)
38089+
return Result;
38090+
38091+
// Check if it's the expected load opcode (e.g., LD_Fp32m, LD_Fp64m, LD_Fp80m)
38092+
if (DefMI->getOpcode() != ExpectedLoadOpcode)
38093+
return Result;
38094+
38095+
// Check that this is a simple load - not volatile, not atomic, etc.
38096+
// FP loads have hasSideEffects = 0 in their definition for simple loads
38097+
if (DefMI->hasOrderedMemoryRef())
38098+
return Result;
38099+
38100+
// The load should have a single def (the destination register) and memory operands
38101+
// Format: %reg = LD_Fpxxm <fi#N>, 1, %noreg, 0, %noreg
38102+
// or: %reg = LD_Fpxxm %base, scale, %index, disp, %segment
38103+
if (DefMI->getNumOperands() < 6)
38104+
return Result;
38105+
38106+
// Operand 0 is the destination, operands 1-5 are the memory reference
38107+
MachineOperand &BaseMO = DefMI->getOperand(1);
38108+
MachineOperand &ScaleMO = DefMI->getOperand(2);
38109+
MachineOperand &IndexMO = DefMI->getOperand(3);
38110+
MachineOperand &DispMO = DefMI->getOperand(4);
38111+
MachineOperand &SegMO = DefMI->getOperand(5);
38112+
38113+
// Check if this is a frame index load
38114+
if (BaseMO.isFI()) {
38115+
Result.IsValid = true;
38116+
Result.IsFrameIndex = true;
38117+
Result.FrameIndex = BaseMO.getIndex();
38118+
Result.ScaleVal = ScaleMO.getImm();
38119+
Result.IndexReg = IndexMO.getReg();
38120+
Result.Disp = DispMO.getImm();
38121+
Result.SegReg = SegMO.getReg();
38122+
return Result;
38123+
}
38124+
38125+
// Check if this is a constant pool load
38126+
// Format: %reg = LD_Fpxxm $noreg, 1, $noreg, %const.N, $noreg
38127+
if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister &&
38128+
ScaleMO.isImm() && IndexMO.isReg() &&
38129+
IndexMO.getReg() == X86::NoRegister &&
38130+
DispMO.isCPI() && SegMO.isReg()) {
38131+
Result.IsValid = true;
38132+
Result.IsConstantPool = true;
38133+
Result.ConstantPoolIndex = DispMO.getIndex();
38134+
Result.ScaleVal = ScaleMO.getImm();
38135+
Result.IndexReg = IndexMO.getReg();
38136+
Result.Disp = 0;
38137+
Result.SegReg = SegMO.getReg();
38138+
return Result;
38139+
}
38140+
38141+
// Check if this is a global variable load
38142+
// Format: %reg = LD_Fpxxm $noreg, 1, $noreg, @global_name, $noreg
38143+
if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister &&
38144+
ScaleMO.isImm() && IndexMO.isReg() &&
38145+
IndexMO.getReg() == X86::NoRegister &&
38146+
DispMO.isGlobal() && SegMO.isReg()) {
38147+
Result.IsValid = true;
38148+
Result.IsGlobal = true;
38149+
Result.Global = DispMO.getGlobal();
38150+
Result.GlobalOffset = DispMO.getOffset();
38151+
Result.ScaleVal = ScaleMO.getImm();
38152+
Result.IndexReg = IndexMO.getReg();
38153+
Result.Disp = 0;
38154+
Result.SegReg = SegMO.getReg();
38155+
return Result;
38156+
}
38157+
38158+
// Regular memory operands (e.g., pointer loads)
38159+
if (BaseMO.isReg() && ScaleMO.isImm() && IndexMO.isReg() &&
38160+
DispMO.isImm() && SegMO.isReg()) {
38161+
Result.IsValid = true;
38162+
Result.IsFrameIndex = false;
38163+
Result.IsConstantPool = false;
38164+
Result.BaseReg = BaseMO.getReg();
38165+
Result.ScaleVal = ScaleMO.getImm();
38166+
Result.IndexReg = IndexMO.getReg();
38167+
Result.Disp = DispMO.getImm();
38168+
Result.SegReg = SegMO.getReg();
38169+
return Result;
38170+
}
38171+
38172+
return Result;
38173+
}
38174+
3806038175
static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI,
3806138176
MachineBasicBlock *BB,
3806238177
unsigned pseudoInstr) {
@@ -38084,6 +38199,85 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI,
3808438199
.addReg(Reg, RegState::Kill);
3808538200
};
3808638201

38202+
// Helper to load integer from memory operands
38203+
auto loadIntFromMemOperands = [&](const FPLoadMemOperands &MemOps,
38204+
unsigned Offset) -> unsigned {
38205+
unsigned IntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
38206+
MachineInstrBuilder MIB =
38207+
BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), IntReg);
38208+
38209+
if (MemOps.IsFrameIndex) {
38210+
// Frame index: addFrameIndex + scale + index + disp + segment
38211+
MIB.addFrameIndex(MemOps.FrameIndex)
38212+
.addImm(MemOps.ScaleVal)
38213+
.addReg(MemOps.IndexReg)
38214+
.addImm(MemOps.Disp + Offset)
38215+
.addReg(MemOps.SegReg);
38216+
} else if (MemOps.IsConstantPool) {
38217+
// Constant pool: base_reg + scale + index + CP_index + segment
38218+
// MOV32rm format: base, scale, index, displacement, segment
38219+
MIB.addReg(X86::NoRegister) // Base register
38220+
.addImm(MemOps.ScaleVal) // Scale
38221+
.addReg(MemOps.IndexReg) // Index register
38222+
.addConstantPoolIndex(MemOps.ConstantPoolIndex, Offset) // Displacement (CP index)
38223+
.addReg(MemOps.SegReg); // Segment
38224+
} else if (MemOps.IsGlobal) {
38225+
// Global variable: base_reg + scale + index + global + segment
38226+
// MOV32rm format: base, scale, index, displacement, segment
38227+
MIB.addReg(X86::NoRegister) // Base register
38228+
.addImm(MemOps.ScaleVal) // Scale
38229+
.addReg(MemOps.IndexReg) // Index register
38230+
.addGlobalAddress(MemOps.Global, MemOps.GlobalOffset + Offset) // Displacement (global address)
38231+
.addReg(MemOps.SegReg); // Segment
38232+
} else {
38233+
// Regular memory: base_reg + scale + index + disp + segment
38234+
MIB.addReg(MemOps.BaseReg)
38235+
.addImm(MemOps.ScaleVal)
38236+
.addReg(MemOps.IndexReg)
38237+
.addImm(MemOps.Disp + Offset)
38238+
.addReg(MemOps.SegReg);
38239+
}
38240+
38241+
return IntReg;
38242+
};
38243+
38244+
// Optimized path: load integers directly from memory when both operands are
38245+
// memory loads, avoiding FP register round-trip
38246+
auto emitCtSelectFromMemory = [&](unsigned NumValues,
38247+
const FPLoadMemOperands &TrueMemOps,
38248+
const FPLoadMemOperands &FalseMemOps,
38249+
int ResultSlot) {
38250+
for (unsigned Val = 0; Val < NumValues; ++Val) {
38251+
unsigned Offset = Val * RegSizeInByte;
38252+
38253+
// Load true and false values directly from their memory locations as integers
38254+
unsigned TrueIntReg = loadIntFromMemOperands(TrueMemOps, Offset);
38255+
unsigned FalseIntReg = loadIntFromMemOperands(FalseMemOps, Offset);
38256+
38257+
// Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time selection
38258+
unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
38259+
unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
38260+
unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
38261+
38262+
BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr))
38263+
.addDef(ResultIntReg) // dst (output)
38264+
.addDef(TmpByteReg) // tmp_byte (output)
38265+
.addDef(TmpMaskReg) // tmp_mask (output)
38266+
.addReg(FalseIntReg) // src1 (input) - false value
38267+
.addReg(TrueIntReg) // src2 (input) - true value
38268+
.addReg(CondByteReg); // pre-materialized condition byte (input)
38269+
38270+
// Store result back to result slot
38271+
BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr))
38272+
.addFrameIndex(ResultSlot)
38273+
.addImm(1)
38274+
.addReg(0)
38275+
.addImm(Offset)
38276+
.addReg(0)
38277+
.addReg(ResultIntReg, RegState::Kill);
38278+
}
38279+
};
38280+
3808738281
auto emitCtSelectWithPseudo = [&](unsigned NumValues, int TrueSlot, int FalseSlot, int ResultSlot) {
3808838282
for (unsigned Val = 0; Val < NumValues; ++Val) {
3808938283
unsigned Offset = Val * RegSizeInByte;
@@ -38131,17 +38325,40 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI,
3813138325

3813238326
switch (pseudoInstr) {
3813338327
case X86::CTSELECT_I386_FP32rr: {
38134-
// Allocate stack slots (4 bytes for f32)
38328+
// Check if both operands are simple memory loads
38329+
FPLoadMemOperands TrueMemOps =
38330+
getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp32m);
38331+
FPLoadMemOperands FalseMemOps =
38332+
getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp32m);
38333+
3813538334
int ResultSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
38136-
int TrueSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
38137-
int FalseSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
3813838335

38139-
// Store f32 values to stack
38140-
storeFpToSlot(X86::ST_Fp32m, TrueSlot, TrueReg);
38141-
storeFpToSlot(X86::ST_Fp32m, FalseSlot, FalseReg);
38336+
if (TrueMemOps.IsValid && FalseMemOps.IsValid) {
38337+
// Optimized path: load directly from memory as integers
38338+
// Works for both frame index loads (stack parameters) and
38339+
// constant pool loads (constants)
38340+
emitCtSelectFromMemory(1, TrueMemOps, FalseMemOps, ResultSlot);
3814238341

38143-
// Use pseudo instruction for selection (1 x 32-bit value)
38144-
emitCtSelectWithPseudo(1, TrueSlot, FalseSlot, ResultSlot);
38342+
// Erase the original FP load instructions since we're not using them
38343+
// and have loaded the data directly as integers instead
38344+
if (MRI.hasOneUse(TrueReg)) {
38345+
if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg))
38346+
TrueDefMI->eraseFromParent();
38347+
}
38348+
if (MRI.hasOneUse(FalseReg)) {
38349+
if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg))
38350+
FalseDefMI->eraseFromParent();
38351+
}
38352+
} else {
38353+
// General path: spill FP registers to stack first
38354+
int TrueSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
38355+
int FalseSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
38356+
38357+
storeFpToSlot(X86::ST_Fp32m, TrueSlot, TrueReg);
38358+
storeFpToSlot(X86::ST_Fp32m, FalseSlot, FalseReg);
38359+
38360+
emitCtSelectWithPseudo(1, TrueSlot, FalseSlot, ResultSlot);
38361+
}
3814538362

3814638363
// Load result back as f32
3814738364
addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp32m), DestReg),
@@ -38150,36 +38367,87 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI,
3815038367
}
3815138368
case X86::CTSELECT_I386_FP64rr: {
3815238369
unsigned StackSlotSize = 8;
38153-
// Allocate stack slots (8 bytes for f64)
38154-
int TrueSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
38155-
int FalseSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
38370+
38371+
// Check if both operands are simple memory loads
38372+
FPLoadMemOperands TrueMemOps =
38373+
getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp64m);
38374+
FPLoadMemOperands FalseMemOps =
38375+
getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp64m);
38376+
3815638377
int ResultSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
3815738378

38158-
// Store f64 values to stack
38159-
storeFpToSlot(X86::ST_Fp64m, TrueSlot, TrueReg);
38160-
storeFpToSlot(X86::ST_Fp64m, FalseSlot, FalseReg);
38379+
if (TrueMemOps.IsValid && FalseMemOps.IsValid) {
38380+
// Optimized path: load directly from memory as integers
38381+
// Works for both frame index loads (stack parameters) and
38382+
// constant pool loads (constants)
38383+
emitCtSelectFromMemory(StackSlotSize / RegSizeInByte, TrueMemOps,
38384+
FalseMemOps, ResultSlot);
3816138385

38162-
// Use pseudo instruction for selection (2 x 32-bit values)
38163-
emitCtSelectWithPseudo(StackSlotSize/RegSizeInByte, TrueSlot, FalseSlot, ResultSlot);
38386+
// Erase the original FP load instructions since we're not using them
38387+
if (MRI.hasOneUse(TrueReg)) {
38388+
if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg))
38389+
TrueDefMI->eraseFromParent();
38390+
}
38391+
if (MRI.hasOneUse(FalseReg)) {
38392+
if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg))
38393+
FalseDefMI->eraseFromParent();
38394+
}
38395+
} else {
38396+
// General path: spill FP registers to stack first
38397+
int TrueSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
38398+
int FalseSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
38399+
38400+
storeFpToSlot(X86::ST_Fp64m, TrueSlot, TrueReg);
38401+
storeFpToSlot(X86::ST_Fp64m, FalseSlot, FalseReg);
38402+
38403+
emitCtSelectWithPseudo(StackSlotSize / RegSizeInByte, TrueSlot, FalseSlot,
38404+
ResultSlot);
38405+
}
3816438406

3816538407
// Load result back as f64
3816638408
addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp64m), DestReg),
3816738409
ResultSlot);
3816838410
break;
3816938411
}
3817038412
case X86::CTSELECT_I386_FP80rr: {
38171-
// Allocate stack slots (12 bytes for f80 - 80-bit = 10 bytes, aligned to 12)
38413+
// f80 is 80 bits (10 bytes), but stored with 12-byte alignment
3817238414
unsigned StackObjectSize = 12;
38173-
int TrueSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
38174-
int FalseSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
38415+
38416+
// Check if both operands are simple memory loads
38417+
FPLoadMemOperands TrueMemOps =
38418+
getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp80m);
38419+
FPLoadMemOperands FalseMemOps =
38420+
getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp80m);
38421+
3817538422
int ResultSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
3817638423

38177-
// Store f80 values to stack
38178-
storeFpToSlot(X86::ST_FpP80m, TrueSlot, TrueReg);
38179-
storeFpToSlot(X86::ST_FpP80m, FalseSlot, FalseReg);
38424+
if (TrueMemOps.IsValid && FalseMemOps.IsValid) {
38425+
// Optimized path: load directly from memory as integers
38426+
// Works for both frame index loads (stack parameters) and
38427+
// constant pool loads (constants)
38428+
emitCtSelectFromMemory(StackObjectSize / RegSizeInByte, TrueMemOps,
38429+
FalseMemOps, ResultSlot);
3818038430

38181-
// Use pseudo instruction for selection (3 x 32-bit values)
38182-
emitCtSelectWithPseudo(StackObjectSize/RegSizeInByte, TrueSlot, FalseSlot, ResultSlot);
38431+
// Erase the original FP load instructions since we're not using them
38432+
if (MRI.hasOneUse(TrueReg)) {
38433+
if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg))
38434+
TrueDefMI->eraseFromParent();
38435+
}
38436+
if (MRI.hasOneUse(FalseReg)) {
38437+
if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg))
38438+
FalseDefMI->eraseFromParent();
38439+
}
38440+
} else {
38441+
// General path: spill FP registers to stack first
38442+
int TrueSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
38443+
int FalseSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
38444+
38445+
storeFpToSlot(X86::ST_FpP80m, TrueSlot, TrueReg);
38446+
storeFpToSlot(X86::ST_FpP80m, FalseSlot, FalseReg);
38447+
38448+
emitCtSelectWithPseudo(StackObjectSize / RegSizeInByte, TrueSlot,
38449+
FalseSlot, ResultSlot);
38450+
}
3818338451

3818438452
// Load result back as f80
3818538453
addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp80m), DestReg),

0 commit comments

Comments
 (0)