@@ -38057,6 +38057,121 @@ emitCTSelectI386WithConditionMaterialization(MachineInstr &MI,
3805738057 return BB;
3805838058}
3805938059
// Helper structure to hold memory operand information for FP loads.
//
// When IsValid is true, exactly one addressing "kind" applies:
//  - IsFrameIndex:   base is a stack frame index (FrameIndex + Disp valid)
//  - IsConstantPool: displacement is a constant-pool index
//  - IsGlobal:       displacement is a global address (Global + GlobalOffset)
//  - none of the above: plain register-based address (BaseReg + Disp valid)
struct FPLoadMemOperands {
  bool IsValid = false;       // True when the def was matched as a simple FP load.
  unsigned BaseReg = 0;       // Base register (register-addressed form only).
  int64_t ScaleVal = 1;       // x86 SIB scale factor.
  unsigned IndexReg = 0;      // Index register, or NoRegister.
  int64_t Disp = 0;           // Immediate displacement (FI / register forms).
  unsigned SegReg = 0;        // Segment register, or NoRegister.
  int FrameIndex = -1;        // Valid only when IsFrameIndex is set.
  bool IsFrameIndex = false;
  int ConstantPoolIndex = -1; // Valid only when IsConstantPool is set.
  bool IsConstantPool = false;
  const GlobalValue *Global = nullptr; // Valid only when IsGlobal is set.
  int64_t GlobalOffset = 0;   // Offset folded into the global reference.
  bool IsGlobal = false;
};
38076+
38077+ // Check if a virtual register is defined by a simple FP load instruction
38078+ // Returns the memory operands if it's a simple load, otherwise returns invalid
38079+ static FPLoadMemOperands getFPLoadMemOperands(Register Reg,
38080+ MachineRegisterInfo &MRI,
38081+ unsigned ExpectedLoadOpcode) {
38082+ FPLoadMemOperands Result;
38083+
38084+ if (!Reg.isVirtual())
38085+ return Result;
38086+
38087+ MachineInstr *DefMI = MRI.getVRegDef(Reg);
38088+ if (!DefMI)
38089+ return Result;
38090+
38091+ // Check if it's the expected load opcode (e.g., LD_Fp32m, LD_Fp64m, LD_Fp80m)
38092+ if (DefMI->getOpcode() != ExpectedLoadOpcode)
38093+ return Result;
38094+
38095+ // Check that this is a simple load - not volatile, not atomic, etc.
38096+ // FP loads have hasSideEffects = 0 in their definition for simple loads
38097+ if (DefMI->hasOrderedMemoryRef())
38098+ return Result;
38099+
38100+ // The load should have a single def (the destination register) and memory operands
38101+ // Format: %reg = LD_Fpxxm <fi#N>, 1, %noreg, 0, %noreg
38102+ // or: %reg = LD_Fpxxm %base, scale, %index, disp, %segment
38103+ if (DefMI->getNumOperands() < 6)
38104+ return Result;
38105+
38106+ // Operand 0 is the destination, operands 1-5 are the memory reference
38107+ MachineOperand &BaseMO = DefMI->getOperand(1);
38108+ MachineOperand &ScaleMO = DefMI->getOperand(2);
38109+ MachineOperand &IndexMO = DefMI->getOperand(3);
38110+ MachineOperand &DispMO = DefMI->getOperand(4);
38111+ MachineOperand &SegMO = DefMI->getOperand(5);
38112+
38113+ // Check if this is a frame index load
38114+ if (BaseMO.isFI()) {
38115+ Result.IsValid = true;
38116+ Result.IsFrameIndex = true;
38117+ Result.FrameIndex = BaseMO.getIndex();
38118+ Result.ScaleVal = ScaleMO.getImm();
38119+ Result.IndexReg = IndexMO.getReg();
38120+ Result.Disp = DispMO.getImm();
38121+ Result.SegReg = SegMO.getReg();
38122+ return Result;
38123+ }
38124+
38125+ // Check if this is a constant pool load
38126+ // Format: %reg = LD_Fpxxm $noreg, 1, $noreg, %const.N, $noreg
38127+ if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister &&
38128+ ScaleMO.isImm() && IndexMO.isReg() &&
38129+ IndexMO.getReg() == X86::NoRegister &&
38130+ DispMO.isCPI() && SegMO.isReg()) {
38131+ Result.IsValid = true;
38132+ Result.IsConstantPool = true;
38133+ Result.ConstantPoolIndex = DispMO.getIndex();
38134+ Result.ScaleVal = ScaleMO.getImm();
38135+ Result.IndexReg = IndexMO.getReg();
38136+ Result.Disp = 0;
38137+ Result.SegReg = SegMO.getReg();
38138+ return Result;
38139+ }
38140+
38141+ // Check if this is a global variable load
38142+ // Format: %reg = LD_Fpxxm $noreg, 1, $noreg, @global_name, $noreg
38143+ if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister &&
38144+ ScaleMO.isImm() && IndexMO.isReg() &&
38145+ IndexMO.getReg() == X86::NoRegister &&
38146+ DispMO.isGlobal() && SegMO.isReg()) {
38147+ Result.IsValid = true;
38148+ Result.IsGlobal = true;
38149+ Result.Global = DispMO.getGlobal();
38150+ Result.GlobalOffset = DispMO.getOffset();
38151+ Result.ScaleVal = ScaleMO.getImm();
38152+ Result.IndexReg = IndexMO.getReg();
38153+ Result.Disp = 0;
38154+ Result.SegReg = SegMO.getReg();
38155+ return Result;
38156+ }
38157+
38158+ // Regular memory operands (e.g., pointer loads)
38159+ if (BaseMO.isReg() && ScaleMO.isImm() && IndexMO.isReg() &&
38160+ DispMO.isImm() && SegMO.isReg()) {
38161+ Result.IsValid = true;
38162+ Result.IsFrameIndex = false;
38163+ Result.IsConstantPool = false;
38164+ Result.BaseReg = BaseMO.getReg();
38165+ Result.ScaleVal = ScaleMO.getImm();
38166+ Result.IndexReg = IndexMO.getReg();
38167+ Result.Disp = DispMO.getImm();
38168+ Result.SegReg = SegMO.getReg();
38169+ return Result;
38170+ }
38171+
38172+ return Result;
38173+ }
38174+
3806038175static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI,
3806138176 MachineBasicBlock *BB,
3806238177 unsigned pseudoInstr) {
@@ -38084,6 +38199,85 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI,
3808438199 .addReg(Reg, RegState::Kill);
3808538200 };
3808638201
  // Load one 32-bit integer chunk from the memory location described by
  // MemOps, at byte offset `Offset` from its start. Emits a MOV32rm before MI
  // and returns the fresh virtual GR32 register holding the loaded value.
  // NOTE(review): this re-reads the FP operand's memory at MI's position;
  // assumes nothing wrote to that location between the original FP load and
  // MI — confirm at the call site before enabling this path.
  auto loadIntFromMemOperands = [&](const FPLoadMemOperands &MemOps,
                                    unsigned Offset) -> unsigned {
    unsigned IntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
    MachineInstrBuilder MIB =
        BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), IntReg);

    // MOV32rm address operand order is always: base, scale, index,
    // displacement, segment. Only the displacement-like operand differs by
    // addressing kind; Offset is folded into it.
    if (MemOps.IsFrameIndex) {
      // Frame index base; Offset is added to the immediate displacement.
      MIB.addFrameIndex(MemOps.FrameIndex)
          .addImm(MemOps.ScaleVal)
          .addReg(MemOps.IndexReg)
          .addImm(MemOps.Disp + Offset)
          .addReg(MemOps.SegReg);
    } else if (MemOps.IsConstantPool) {
      // Constant-pool entry; Offset becomes the CP operand's offset.
      MIB.addReg(X86::NoRegister)                             // Base register
          .addImm(MemOps.ScaleVal)                            // Scale
          .addReg(MemOps.IndexReg)                            // Index register
          .addConstantPoolIndex(MemOps.ConstantPoolIndex, Offset) // Displacement (CP index)
          .addReg(MemOps.SegReg);                             // Segment
    } else if (MemOps.IsGlobal) {
      // Global address; Offset is folded into the global's offset.
      MIB.addReg(X86::NoRegister)                             // Base register
          .addImm(MemOps.ScaleVal)                            // Scale
          .addReg(MemOps.IndexReg)                            // Index register
          .addGlobalAddress(MemOps.Global, MemOps.GlobalOffset + Offset) // Displacement (global address)
          .addReg(MemOps.SegReg);                             // Segment
    } else {
      // Plain register-based address; Offset is added to the displacement.
      MIB.addReg(MemOps.BaseReg)
          .addImm(MemOps.ScaleVal)
          .addReg(MemOps.IndexReg)
          .addImm(MemOps.Disp + Offset)
          .addReg(MemOps.SegReg);
    }

    return IntReg;
  };
38243+
  // Optimized path: when both select operands are simple memory loads, select
  // NumValues 32-bit chunks directly between the two memory locations and
  // store the chosen chunks into ResultSlot, avoiding the x87 FP register
  // round-trip entirely. CondByteReg (captured) holds the pre-materialized
  // condition byte consumed by the constant-time select pseudo.
  auto emitCtSelectFromMemory = [&](unsigned NumValues,
                                    const FPLoadMemOperands &TrueMemOps,
                                    const FPLoadMemOperands &FalseMemOps,
                                    int ResultSlot) {
    for (unsigned Val = 0; Val < NumValues; ++Val) {
      // Byte offset of this 32-bit chunk within the FP value.
      unsigned Offset = Val * RegSizeInByte;

      // Load the corresponding chunk of the true and false values as integers.
      unsigned TrueIntReg = loadIntFromMemOperands(TrueMemOps, Offset);
      unsigned FalseIntReg = loadIntFromMemOperands(FalseMemOps, Offset);

      // Constant-time selection via the CTSELECT_I386_INT_GR32rr pseudo; it
      // defines the result plus two scratch registers (byte + mask).
      unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
      unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
      unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);

      BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr))
          .addDef(ResultIntReg)  // dst (output)
          .addDef(TmpByteReg)    // tmp_byte (output scratch)
          .addDef(TmpMaskReg)    // tmp_mask (output scratch)
          .addReg(FalseIntReg)   // src1 (input) - false value
          .addReg(TrueIntReg)    // src2 (input) - true value
          .addReg(CondByteReg);  // pre-materialized condition byte (input)

      // Store the selected chunk into the result stack slot.
      BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr))
          .addFrameIndex(ResultSlot)
          .addImm(1)
          .addReg(0)
          .addImm(Offset)
          .addReg(0)
          .addReg(ResultIntReg, RegState::Kill);
    }
  };
38280+
3808738281 auto emitCtSelectWithPseudo = [&](unsigned NumValues, int TrueSlot, int FalseSlot, int ResultSlot) {
3808838282 for (unsigned Val = 0; Val < NumValues; ++Val) {
3808938283 unsigned Offset = Val * RegSizeInByte;
@@ -38131,17 +38325,40 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI,
3813138325
3813238326 switch (pseudoInstr) {
3813338327 case X86::CTSELECT_I386_FP32rr: {
38134- // Allocate stack slots (4 bytes for f32)
38328+ // Check if both operands are simple memory loads
38329+ FPLoadMemOperands TrueMemOps =
38330+ getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp32m);
38331+ FPLoadMemOperands FalseMemOps =
38332+ getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp32m);
38333+
3813538334 int ResultSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
38136- int TrueSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
38137- int FalseSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
3813838335
38139- // Store f32 values to stack
38140- storeFpToSlot(X86::ST_Fp32m, TrueSlot, TrueReg);
38141- storeFpToSlot(X86::ST_Fp32m, FalseSlot, FalseReg);
38336+ if (TrueMemOps.IsValid && FalseMemOps.IsValid) {
38337+ // Optimized path: load directly from memory as integers
38338+ // Works for both frame index loads (stack parameters) and
38339+ // constant pool loads (constants)
38340+ emitCtSelectFromMemory(1, TrueMemOps, FalseMemOps, ResultSlot);
3814238341
38143- // Use pseudo instruction for selection (1 x 32-bit value)
38144- emitCtSelectWithPseudo(1, TrueSlot, FalseSlot, ResultSlot);
38342+ // Erase the original FP load instructions since we're not using them
38343+ // and have loaded the data directly as integers instead
38344+ if (MRI.hasOneUse(TrueReg)) {
38345+ if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg))
38346+ TrueDefMI->eraseFromParent();
38347+ }
38348+ if (MRI.hasOneUse(FalseReg)) {
38349+ if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg))
38350+ FalseDefMI->eraseFromParent();
38351+ }
38352+ } else {
38353+ // General path: spill FP registers to stack first
38354+ int TrueSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
38355+ int FalseSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
38356+
38357+ storeFpToSlot(X86::ST_Fp32m, TrueSlot, TrueReg);
38358+ storeFpToSlot(X86::ST_Fp32m, FalseSlot, FalseReg);
38359+
38360+ emitCtSelectWithPseudo(1, TrueSlot, FalseSlot, ResultSlot);
38361+ }
3814538362
3814638363 // Load result back as f32
3814738364 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp32m), DestReg),
@@ -38150,36 +38367,87 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI,
3815038367 }
3815138368 case X86::CTSELECT_I386_FP64rr: {
3815238369 unsigned StackSlotSize = 8;
38153- // Allocate stack slots (8 bytes for f64)
38154- int TrueSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
38155- int FalseSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
38370+
38371+ // Check if both operands are simple memory loads
38372+ FPLoadMemOperands TrueMemOps =
38373+ getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp64m);
38374+ FPLoadMemOperands FalseMemOps =
38375+ getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp64m);
38376+
3815638377 int ResultSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
3815738378
38158- // Store f64 values to stack
38159- storeFpToSlot(X86::ST_Fp64m, TrueSlot, TrueReg);
38160- storeFpToSlot(X86::ST_Fp64m, FalseSlot, FalseReg);
38379+ if (TrueMemOps.IsValid && FalseMemOps.IsValid) {
38380+ // Optimized path: load directly from memory as integers
38381+ // Works for both frame index loads (stack parameters) and
38382+ // constant pool loads (constants)
38383+ emitCtSelectFromMemory(StackSlotSize / RegSizeInByte, TrueMemOps,
38384+ FalseMemOps, ResultSlot);
3816138385
38162- // Use pseudo instruction for selection (2 x 32-bit values)
38163- emitCtSelectWithPseudo(StackSlotSize/RegSizeInByte, TrueSlot, FalseSlot, ResultSlot);
38386+ // Erase the original FP load instructions since we're not using them
38387+ if (MRI.hasOneUse(TrueReg)) {
38388+ if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg))
38389+ TrueDefMI->eraseFromParent();
38390+ }
38391+ if (MRI.hasOneUse(FalseReg)) {
38392+ if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg))
38393+ FalseDefMI->eraseFromParent();
38394+ }
38395+ } else {
38396+ // General path: spill FP registers to stack first
38397+ int TrueSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
38398+ int FalseSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
38399+
38400+ storeFpToSlot(X86::ST_Fp64m, TrueSlot, TrueReg);
38401+ storeFpToSlot(X86::ST_Fp64m, FalseSlot, FalseReg);
38402+
38403+ emitCtSelectWithPseudo(StackSlotSize / RegSizeInByte, TrueSlot, FalseSlot,
38404+ ResultSlot);
38405+ }
3816438406
3816538407 // Load result back as f64
3816638408 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp64m), DestReg),
3816738409 ResultSlot);
3816838410 break;
3816938411 }
3817038412 case X86::CTSELECT_I386_FP80rr: {
38171- // Allocate stack slots (12 bytes for f80 - 80-bit = 10 bytes, aligned to 12)
38413+ // f80 is 80 bits (10 bytes); the stack slot is padded to 12 bytes (three 32-bit words)
3817238414 unsigned StackObjectSize = 12;
38173- int TrueSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
38174- int FalseSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
38415+
38416+ // Check if both operands are simple memory loads
38417+ FPLoadMemOperands TrueMemOps =
38418+ getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp80m);
38419+ FPLoadMemOperands FalseMemOps =
38420+ getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp80m);
38421+
3817538422 int ResultSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
3817638423
38177- // Store f80 values to stack
38178- storeFpToSlot(X86::ST_FpP80m, TrueSlot, TrueReg);
38179- storeFpToSlot(X86::ST_FpP80m, FalseSlot, FalseReg);
38424+ if (TrueMemOps.IsValid && FalseMemOps.IsValid) {
38425+ // Optimized path: load directly from memory as integers
38426+ // Works for both frame index loads (stack parameters) and
38427+ // constant pool loads (constants)
38428+ emitCtSelectFromMemory(StackObjectSize / RegSizeInByte, TrueMemOps,
38429+ FalseMemOps, ResultSlot);
3818038430
38181- // Use pseudo instruction for selection (3 x 32-bit values)
38182- emitCtSelectWithPseudo(StackObjectSize/RegSizeInByte, TrueSlot, FalseSlot, ResultSlot);
38431+ // Erase the original FP load instructions since we're not using them
38432+ if (MRI.hasOneUse(TrueReg)) {
38433+ if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg))
38434+ TrueDefMI->eraseFromParent();
38435+ }
38436+ if (MRI.hasOneUse(FalseReg)) {
38437+ if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg))
38438+ FalseDefMI->eraseFromParent();
38439+ }
38440+ } else {
38441+ // General path: spill FP registers to stack first
38442+ int TrueSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
38443+ int FalseSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
38444+
38445+ storeFpToSlot(X86::ST_FpP80m, TrueSlot, TrueReg);
38446+ storeFpToSlot(X86::ST_FpP80m, FalseSlot, FalseReg);
38447+
38448+ emitCtSelectWithPseudo(StackObjectSize / RegSizeInByte, TrueSlot,
38449+ FalseSlot, ResultSlot);
38450+ }
3818338451
3818438452 // Load result back as f80
3818538453 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp80m), DestReg),
0 commit comments