@@ -25523,13 +25523,69 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
2552325523 // Handle floating point on i386 without SSE/CMOV (constant-time requirement)
2552425524 if (!Subtarget.hasSSE1() && VT.isFloatingPoint() && !VT.isVector()) {
2552525525 if (VT == MVT::f32) {
25526- // Bitcast f32 to i32, use raw condition with ISD::CTSELECT (avoids EFLAGS redundancy)
25526+ // Optimize: if operands are memory loads, access raw bits directly
25527+ if (TrueOp.getOpcode() == ISD::LOAD && FalseOp.getOpcode() == ISD::LOAD) {
25528+ LoadSDNode *TrueLoad = cast<LoadSDNode>(TrueOp.getNode());
25529+ LoadSDNode *FalseLoad = cast<LoadSDNode>(FalseOp.getNode());
25530+
25531+ // Load the same memory addresses as i32 (raw f32 bits)
25532+ SDValue TrueI32 = DAG.getLoad(MVT::i32, DL, TrueLoad->getChain(),
25533+ TrueLoad->getBasePtr(), TrueLoad->getPointerInfo());
25534+ SDValue FalseI32 = DAG.getLoad(MVT::i32, DL, FalseLoad->getChain(),
25535+ FalseLoad->getBasePtr(), FalseLoad->getPointerInfo());
25536+
25537+ // Direct CTSELECT on raw bits
25538+ SDValue CtSelect = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TrueI32, FalseI32);
25539+
25540+ // Store result and load back as f32
25541+ SDValue ResultSlot = DAG.CreateStackTemporary(MVT::f32);
25542+ SDValue Store = DAG.getStore(DAG.getEntryNode(), DL, CtSelect, ResultSlot,
25543+ MachinePointerInfo());
25544+ return DAG.getLoad(MVT::f32, DL, Store, ResultSlot, MachinePointerInfo());
25545+ }
25546+
25547+ // Fallback: bitcast approach for register values
2552725548 TrueOp = DAG.getBitcast(MVT::i32, TrueOp);
2552825549 FalseOp = DAG.getBitcast(MVT::i32, FalseOp);
2552925550 SDValue CtSelect = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TrueOp, FalseOp);
2553025551 return DAG.getBitcast(VT, CtSelect);
2553125552 } else if (VT == MVT::f64) {
25532- // For f64 on i386, avoid all i64 operations by using memory to split/reassemble
25553+ // Optimize: if operands are memory loads, access raw bits directly
25554+ if (TrueOp.getOpcode() == ISD::LOAD && FalseOp.getOpcode() == ISD::LOAD) {
25555+ LoadSDNode *TrueLoad = cast<LoadSDNode>(TrueOp.getNode());
25556+ LoadSDNode *FalseLoad = cast<LoadSDNode>(FalseOp.getNode());
25557+
25558+ // Load i32 parts directly from memory (lo/hi 32-bit chunks)
25559+ SDValue TrueLo = DAG.getLoad(MVT::i32, DL, TrueLoad->getChain(),
25560+ TrueLoad->getBasePtr(), TrueLoad->getPointerInfo());
25561+ SDValue TrueHiPtr = DAG.getMemBasePlusOffset(TrueLoad->getBasePtr(),
25562+ TypeSize::getFixed(4), DL);
25563+ SDValue TrueHi = DAG.getLoad(MVT::i32, DL, TrueLoad->getChain(),
25564+ TrueHiPtr, TrueLoad->getPointerInfo());
25565+
25566+ SDValue FalseLo = DAG.getLoad(MVT::i32, DL, FalseLoad->getChain(),
25567+ FalseLoad->getBasePtr(), FalseLoad->getPointerInfo());
25568+ SDValue FalseHiPtr = DAG.getMemBasePlusOffset(FalseLoad->getBasePtr(),
25569+ TypeSize::getFixed(4), DL);
25570+ SDValue FalseHi = DAG.getLoad(MVT::i32, DL, FalseLoad->getChain(),
25571+ FalseHiPtr, FalseLoad->getPointerInfo());
25572+
25573+ // Direct CTSELECT on both i32 parts
25574+ SDValue LoSelect = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TrueLo, FalseLo);
25575+ SDValue HiSelect = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TrueHi, FalseHi);
25576+
25577+ // Store result parts and load back as f64
25578+ SDValue ResultSlot = DAG.CreateStackTemporary(MVT::f64);
25579+ SDValue Chain = DAG.getEntryNode();
25580+ SDValue StoreResLo = DAG.getStore(Chain, DL, LoSelect, ResultSlot,
25581+ MachinePointerInfo());
25582+ SDValue ResHiPtr = DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(4), DL);
25583+ SDValue StoreResHi = DAG.getStore(StoreResLo, DL, HiSelect, ResHiPtr,
25584+ MachinePointerInfo());
25585+ return DAG.getLoad(MVT::f64, DL, StoreResHi, ResultSlot, MachinePointerInfo());
25586+ }
25587+
25588+ // Fallback: memory-based approach for register values
2553325589 // TODO: Consider creating CTSELECT_I386_F64mm pseudo instruction
2553425590 // for single bundled 64-bit memory-based post-RA expansion
2553525591
@@ -25573,7 +25629,54 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
2557325629 // Load complete f64 result from memory
2557425630 return DAG.getLoad(MVT::f64, DL, StoreResHi, ResultSlot, MachinePointerInfo());
2557525631 } else if (VT == MVT::f80) {
25576- // For f80 on i386, use memory-based approach with 3×32-bit chunks
25632+ // Optimize: if operands are memory loads, access raw bits directly
25633+ if (TrueOp.getOpcode() == ISD::LOAD && FalseOp.getOpcode() == ISD::LOAD) {
25634+ LoadSDNode *TrueLoad = cast<LoadSDNode>(TrueOp.getNode());
25635+ LoadSDNode *FalseLoad = cast<LoadSDNode>(FalseOp.getNode());
25636+
25637+ // Load i32 parts directly from memory (3 chunks: [0-3], [4-7], [8-11] bytes)
25638+ SDValue TruePart0 = DAG.getLoad(MVT::i32, DL, TrueLoad->getChain(),
25639+ TrueLoad->getBasePtr(), TrueLoad->getPointerInfo());
25640+ SDValue TruePart1Ptr = DAG.getMemBasePlusOffset(TrueLoad->getBasePtr(),
25641+ TypeSize::getFixed(4), DL);
25642+ SDValue TruePart1 = DAG.getLoad(MVT::i32, DL, TrueLoad->getChain(),
25643+ TruePart1Ptr, TrueLoad->getPointerInfo());
25644+ SDValue TruePart2Ptr = DAG.getMemBasePlusOffset(TrueLoad->getBasePtr(),
25645+ TypeSize::getFixed(8), DL);
25646+ SDValue TruePart2 = DAG.getLoad(MVT::i32, DL, TrueLoad->getChain(),
25647+ TruePart2Ptr, TrueLoad->getPointerInfo());
25648+
25649+ SDValue FalsePart0 = DAG.getLoad(MVT::i32, DL, FalseLoad->getChain(),
25650+ FalseLoad->getBasePtr(), FalseLoad->getPointerInfo());
25651+ SDValue FalsePart1Ptr = DAG.getMemBasePlusOffset(FalseLoad->getBasePtr(),
25652+ TypeSize::getFixed(4), DL);
25653+ SDValue FalsePart1 = DAG.getLoad(MVT::i32, DL, FalseLoad->getChain(),
25654+ FalsePart1Ptr, FalseLoad->getPointerInfo());
25655+ SDValue FalsePart2Ptr = DAG.getMemBasePlusOffset(FalseLoad->getBasePtr(),
25656+ TypeSize::getFixed(8), DL);
25657+ SDValue FalsePart2 = DAG.getLoad(MVT::i32, DL, FalseLoad->getChain(),
25658+ FalsePart2Ptr, FalseLoad->getPointerInfo());
25659+
25660+ // Direct CTSELECT on all three i32 parts
25661+ SDValue Part0Select = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TruePart0, FalsePart0);
25662+ SDValue Part1Select = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TruePart1, FalsePart1);
25663+ SDValue Part2Select = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TruePart2, FalsePart2);
25664+
25665+ // Store result parts and load back as f80
25666+ SDValue ResultSlot = DAG.CreateStackTemporary(MVT::f80);
25667+ SDValue Chain = DAG.getEntryNode();
25668+ SDValue StorePart0 = DAG.getStore(Chain, DL, Part0Select, ResultSlot,
25669+ MachinePointerInfo());
25670+ SDValue ResPart1Ptr = DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(4), DL);
25671+ SDValue StorePart1 = DAG.getStore(StorePart0, DL, Part1Select, ResPart1Ptr,
25672+ MachinePointerInfo());
25673+ SDValue ResPart2Ptr = DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(8), DL);
25674+ SDValue StorePart2 = DAG.getStore(StorePart1, DL, Part2Select, ResPart2Ptr,
25675+ MachinePointerInfo());
25676+ return DAG.getLoad(MVT::f80, DL, StorePart2, ResultSlot, MachinePointerInfo());
25677+ }
25678+
25679+ // Fallback: memory-based approach for register values
2557725680 // f80 is stored as 96 bits (80 bits + 16 padding), handled as 3×i32
2557825681 // TODO: Consider creating CTSELECT_I386_F80mm pseudo instruction
2557925682 // for single bundled 80-bit memory-based post-RA expansion
0 commit comments