Skip to content

Commit e4fcc0f

Browse files
hbrodin authored and wizardengineer committed
[CT] Optimize for when operands on stack already - skip FP
1 parent 878d53d commit e4fcc0f

File tree

4 files changed: 111 additions and 29 deletions.

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -6834,9 +6834,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
68346834
return;
68356835
}
68366836
case Intrinsic::ct_select: {
6837-
// Set function attribute to indicate ct.select usage
6838-
Function &F = DAG.getMachineFunction().getFunction();
6839-
F.addFnAttr("ct-select");
68406837

68416838
SDLoc DL = getCurSDLoc();
68426839

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 106 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -25523,13 +25523,69 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
2552325523
// Handle floating point on i386 without SSE/CMOV (constant-time requirement)
2552425524
if (!Subtarget.hasSSE1() && VT.isFloatingPoint() && !VT.isVector()) {
2552525525
if (VT == MVT::f32) {
25526-
// Bitcast f32 to i32, use raw condition with ISD::CTSELECT (avoids EFLAGS redundancy)
25526+
// Optimize: if operands are memory loads, access raw bits directly
25527+
if (TrueOp.getOpcode() == ISD::LOAD && FalseOp.getOpcode() == ISD::LOAD) {
25528+
LoadSDNode *TrueLoad = cast<LoadSDNode>(TrueOp.getNode());
25529+
LoadSDNode *FalseLoad = cast<LoadSDNode>(FalseOp.getNode());
25530+
25531+
// Load the same memory addresses as i32 (raw f32 bits)
25532+
SDValue TrueI32 = DAG.getLoad(MVT::i32, DL, TrueLoad->getChain(),
25533+
TrueLoad->getBasePtr(), TrueLoad->getPointerInfo());
25534+
SDValue FalseI32 = DAG.getLoad(MVT::i32, DL, FalseLoad->getChain(),
25535+
FalseLoad->getBasePtr(), FalseLoad->getPointerInfo());
25536+
25537+
// Direct CTSELECT on raw bits
25538+
SDValue CtSelect = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TrueI32, FalseI32);
25539+
25540+
// Store result and load back as f32
25541+
SDValue ResultSlot = DAG.CreateStackTemporary(MVT::f32);
25542+
SDValue Store = DAG.getStore(DAG.getEntryNode(), DL, CtSelect, ResultSlot,
25543+
MachinePointerInfo());
25544+
return DAG.getLoad(MVT::f32, DL, Store, ResultSlot, MachinePointerInfo());
25545+
}
25546+
25547+
// Fallback: bitcast approach for register values
2552725548
TrueOp = DAG.getBitcast(MVT::i32, TrueOp);
2552825549
FalseOp = DAG.getBitcast(MVT::i32, FalseOp);
2552925550
SDValue CtSelect = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TrueOp, FalseOp);
2553025551
return DAG.getBitcast(VT, CtSelect);
2553125552
} else if (VT == MVT::f64) {
25532-
// For f64 on i386, avoid all i64 operations by using memory to split/reassemble
25553+
// Optimize: if operands are memory loads, access raw bits directly
25554+
if (TrueOp.getOpcode() == ISD::LOAD && FalseOp.getOpcode() == ISD::LOAD) {
25555+
LoadSDNode *TrueLoad = cast<LoadSDNode>(TrueOp.getNode());
25556+
LoadSDNode *FalseLoad = cast<LoadSDNode>(FalseOp.getNode());
25557+
25558+
// Load i32 parts directly from memory (lo/hi 32-bit chunks)
25559+
SDValue TrueLo = DAG.getLoad(MVT::i32, DL, TrueLoad->getChain(),
25560+
TrueLoad->getBasePtr(), TrueLoad->getPointerInfo());
25561+
SDValue TrueHiPtr = DAG.getMemBasePlusOffset(TrueLoad->getBasePtr(),
25562+
TypeSize::getFixed(4), DL);
25563+
SDValue TrueHi = DAG.getLoad(MVT::i32, DL, TrueLoad->getChain(),
25564+
TrueHiPtr, TrueLoad->getPointerInfo());
25565+
25566+
SDValue FalseLo = DAG.getLoad(MVT::i32, DL, FalseLoad->getChain(),
25567+
FalseLoad->getBasePtr(), FalseLoad->getPointerInfo());
25568+
SDValue FalseHiPtr = DAG.getMemBasePlusOffset(FalseLoad->getBasePtr(),
25569+
TypeSize::getFixed(4), DL);
25570+
SDValue FalseHi = DAG.getLoad(MVT::i32, DL, FalseLoad->getChain(),
25571+
FalseHiPtr, FalseLoad->getPointerInfo());
25572+
25573+
// Direct CTSELECT on both i32 parts
25574+
SDValue LoSelect = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TrueLo, FalseLo);
25575+
SDValue HiSelect = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TrueHi, FalseHi);
25576+
25577+
// Store result parts and load back as f64
25578+
SDValue ResultSlot = DAG.CreateStackTemporary(MVT::f64);
25579+
SDValue Chain = DAG.getEntryNode();
25580+
SDValue StoreResLo = DAG.getStore(Chain, DL, LoSelect, ResultSlot,
25581+
MachinePointerInfo());
25582+
SDValue ResHiPtr = DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(4), DL);
25583+
SDValue StoreResHi = DAG.getStore(StoreResLo, DL, HiSelect, ResHiPtr,
25584+
MachinePointerInfo());
25585+
return DAG.getLoad(MVT::f64, DL, StoreResHi, ResultSlot, MachinePointerInfo());
25586+
}
25587+
25588+
// Fallback: memory-based approach for register values
2553325589
// TODO: Consider creating CTSELECT_I386_F64mm pseudo instruction
2553425590
// for single bundled 64-bit memory-based post-RA expansion
2553525591

@@ -25573,7 +25629,54 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
2557325629
// Load complete f64 result from memory
2557425630
return DAG.getLoad(MVT::f64, DL, StoreResHi, ResultSlot, MachinePointerInfo());
2557525631
} else if (VT == MVT::f80) {
25576-
// For f80 on i386, use memory-based approach with 3×32-bit chunks
25632+
// Optimize: if operands are memory loads, access raw bits directly
25633+
if (TrueOp.getOpcode() == ISD::LOAD && FalseOp.getOpcode() == ISD::LOAD) {
25634+
LoadSDNode *TrueLoad = cast<LoadSDNode>(TrueOp.getNode());
25635+
LoadSDNode *FalseLoad = cast<LoadSDNode>(FalseOp.getNode());
25636+
25637+
// Load i32 parts directly from memory (3 chunks: [0-3], [4-7], [8-11] bytes)
25638+
SDValue TruePart0 = DAG.getLoad(MVT::i32, DL, TrueLoad->getChain(),
25639+
TrueLoad->getBasePtr(), TrueLoad->getPointerInfo());
25640+
SDValue TruePart1Ptr = DAG.getMemBasePlusOffset(TrueLoad->getBasePtr(),
25641+
TypeSize::getFixed(4), DL);
25642+
SDValue TruePart1 = DAG.getLoad(MVT::i32, DL, TrueLoad->getChain(),
25643+
TruePart1Ptr, TrueLoad->getPointerInfo());
25644+
SDValue TruePart2Ptr = DAG.getMemBasePlusOffset(TrueLoad->getBasePtr(),
25645+
TypeSize::getFixed(8), DL);
25646+
SDValue TruePart2 = DAG.getLoad(MVT::i32, DL, TrueLoad->getChain(),
25647+
TruePart2Ptr, TrueLoad->getPointerInfo());
25648+
25649+
SDValue FalsePart0 = DAG.getLoad(MVT::i32, DL, FalseLoad->getChain(),
25650+
FalseLoad->getBasePtr(), FalseLoad->getPointerInfo());
25651+
SDValue FalsePart1Ptr = DAG.getMemBasePlusOffset(FalseLoad->getBasePtr(),
25652+
TypeSize::getFixed(4), DL);
25653+
SDValue FalsePart1 = DAG.getLoad(MVT::i32, DL, FalseLoad->getChain(),
25654+
FalsePart1Ptr, FalseLoad->getPointerInfo());
25655+
SDValue FalsePart2Ptr = DAG.getMemBasePlusOffset(FalseLoad->getBasePtr(),
25656+
TypeSize::getFixed(8), DL);
25657+
SDValue FalsePart2 = DAG.getLoad(MVT::i32, DL, FalseLoad->getChain(),
25658+
FalsePart2Ptr, FalseLoad->getPointerInfo());
25659+
25660+
// Direct CTSELECT on all three i32 parts
25661+
SDValue Part0Select = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TruePart0, FalsePart0);
25662+
SDValue Part1Select = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TruePart1, FalsePart1);
25663+
SDValue Part2Select = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, Cond, TruePart2, FalsePart2);
25664+
25665+
// Store result parts and load back as f80
25666+
SDValue ResultSlot = DAG.CreateStackTemporary(MVT::f80);
25667+
SDValue Chain = DAG.getEntryNode();
25668+
SDValue StorePart0 = DAG.getStore(Chain, DL, Part0Select, ResultSlot,
25669+
MachinePointerInfo());
25670+
SDValue ResPart1Ptr = DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(4), DL);
25671+
SDValue StorePart1 = DAG.getStore(StorePart0, DL, Part1Select, ResPart1Ptr,
25672+
MachinePointerInfo());
25673+
SDValue ResPart2Ptr = DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(8), DL);
25674+
SDValue StorePart2 = DAG.getStore(StorePart1, DL, Part2Select, ResPart2Ptr,
25675+
MachinePointerInfo());
25676+
return DAG.getLoad(MVT::f80, DL, StorePart2, ResultSlot, MachinePointerInfo());
25677+
}
25678+
25679+
// Fallback: memory-based approach for register values
2557725680
// f80 is stored as 96 bits (80 bits + 16 padding), handled as 3×i32
2557825681
// TODO: Consider creating CTSELECT_I386_F80mm pseudo instruction
2557925682
// for single bundled 80-bit memory-based post-RA expansion

llvm/lib/Target/X86/X86InstrCompiler.td

Lines changed: 0 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -747,9 +747,6 @@ let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
747747
defm CTSELECT_I386_INT_GR16 : CTSELECT_I386_INTERNAL<GR16, GR8>;
748748
defm CTSELECT_I386_INT_GR32 : CTSELECT_I386_INTERNAL<GR32, GR8>;
749749
}
750-
let Predicates = [NoCMOV, HasMMX] in {
751-
defm CTSELECT_I386_VR64 : CTSELECT_I386_VR64<VR64>;
752-
}
753750
}
754751

755752
let usesCustomInserter = 1,
@@ -786,12 +783,6 @@ let Predicates = [NoNativeCMOV] in {
786783
// i64 patterns handled automatically by type legalization
787784
}
788785

789-
// Pattern matching for VR64 CTSELECT on i386 without CMOV (routes to post-RA expansion)
790-
let Predicates = [NoCMOV, Not64BitMode, HasMMX] in {
791-
def : Pat<(x86mmx(X86ctselect VR64:$src1, VR64:$src2, timm:$cond, EFLAGS)),
792-
(CTSELECT_I386_VR64rr VR64:$src1, VR64:$src2, timm:$cond)>;
793-
}
794-
795786
//===----------------------------------------------------------------------===//
796787
// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
797788
//===----------------------------------------------------------------------===//

llvm/lib/Target/X86/X86InstrInfo.cpp

Lines changed: 5 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1023,15 +1023,6 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
10231023
// Remove the original pseudo instruction
10241024
MI.eraseFromParent();
10251025

1026-
// Bundle all generated instructions for atomic execution
1027-
auto BundleEnd = MI.getIterator();
1028-
if (BundleStart != BundleEnd) {
1029-
// Only bundle if we have multiple instructions
1030-
finalizeBundle(*MBB, BundleStart, BundleEnd);
1031-
}
1032-
1033-
// Remove the original pseudo instruction
1034-
MI.eraseFromParent();
10351026
return true;
10361027
}
10371028

@@ -7010,13 +7001,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
70107001
return expandCtSelectVector(MI);
70117002

70127003
// i386-specific CTSELECT expansion (post-RA, constant-time)
7013-
case X86::CTSELECT_I386_GR16rr:
7014-
case X86::CTSELECT_I386_GR32rr:
7015-
return expandCtSelectI386(MI);
7004+
//case X86::CTSELECT_I386_GR16rr:
7005+
//case X86::CTSELECT_I386_GR32rr:
7006+
// return expandCtSelectI386(MI);
70167007

70177008
// VR64-specific CTSELECT expansion (post-RA, constant-time)
7018-
case X86::CTSELECT_I386_VR64rr:
7019-
return expandCtSelectI386VR64(MI);
7009+
//case X86::CTSELECT_I386_VR64rr:
7010+
// return expandCtSelectI386VR64(MI);
70207011
}
70217012
return false;
70227013
}

0 commit comments

Comments
 (0)