Skip to content

Commit 89203ff

Browse files
[LLVM][X86] Add native ct.select support for X86 and i386
Add native X86 implementation with CMOV instructions and comprehensive tests:
- X86 ISelLowering with CMOV for x86_64 and i386
- Fallback bitwise operations for i386 targets without CMOV
- Post-RA expansion for pseudo-instructions
- Comprehensive test coverage:
  - Edge cases (zero conditions, large integers)
  - i386-specific tests (FP, MMX, non-CMOV fallback)
  - Vector operations
  - Optimization patterns

The basic test demonstrating fallback is in the core infrastructure PR.
1 parent cbb5490 commit 89203ff

17 files changed

+5671
-451
lines changed

llvm/lib/Target/X86/X86.td

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -825,9 +825,10 @@ include "X86SchedSapphireRapids.td"
825825

826826
def ProcessorFeatures {
827827
// x86-64 micro-architecture levels: x86-64 and x86-64-v[234]
828-
list<SubtargetFeature> X86_64V1Features = [
829-
FeatureX87, FeatureCX8, FeatureCMOV, FeatureMMX, FeatureSSE2,
830-
FeatureFXSR, FeatureNOPL, FeatureX86_64,
828+
list<SubtargetFeature> X86_64V1Features = [FeatureX87, FeatureCX8,
829+
FeatureCMOV, FeatureMMX,
830+
FeatureSSE2, FeatureFXSR,
831+
FeatureNOPL, FeatureX86_64,
831832
];
832833
list<SubtargetFeature> X86_64V1Tuning = [
833834
TuningMacroFusion,
@@ -1161,6 +1162,7 @@ def ProcessorFeatures {
11611162
FeatureAVXNECONVERT,
11621163
FeatureAVXVNNIINT8,
11631164
FeatureAVXVNNIINT16,
1165+
FeatureUSERMSR,
11641166
FeatureSHA512,
11651167
FeatureSM3,
11661168
FeatureEGPR,

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 763 additions & 28 deletions
Large diffs are not rendered by default.

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,10 @@ namespace llvm {
114114
/// X86 Select
115115
SELECTS,
116116

117+
/// X86 constant-time select: chooses between two values based on EFLAGS.
118+
/// Implemented with a CMOV instruction where one is natively available.
119+
CTSELECT,
120+
117121
// Same as SETCC except it's materialized with an SBB and the value is all
118122
// ones or all zeros.
119123
SETCC_CARRY, // R = carry_bit ? ~0 : 0
@@ -1139,6 +1143,8 @@ namespace llvm {
11391143
///
11401144
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
11411145

1146+
bool isSelectSupported(SelectSupportKind Kind) const override;
1147+
11421148
/// Replace the results of node with an illegal result
11431149
/// type with new values built out of custom code.
11441150
///
@@ -1765,6 +1771,7 @@ namespace llvm {
17651771
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
17661772
SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
17671773
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
1774+
SDValue LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const;
17681775
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
17691776
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
17701777
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;

llvm/lib/Target/X86/X86InstrCMovSetCC.td

Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,211 @@ let Predicates = [HasCMOV, HasNDD] in {
106106
def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, timm:$cond, EFLAGS),
107107
(CMOV64rm_ND GR64:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
108108
}
109+
110+
// Create pseudo instructions and pattern-match X86ctselect onto them.
111+
// A machine pass later lowers these pseudos into CMOV, so that
112+
// backend optimizations do not transform them.
113+
let Uses = [EFLAGS], isNotDuplicable = 1, isPseudo = 1 in {
114+
115+
multiclass CTSELECT<X86TypeInfo t> {
116+
// register-only
117+
let isCommutable = 0, SchedRW = [WriteCMOV], Predicates = [HasNativeCMOV],
118+
AsmString = "ctselect\\t$dst, $src1, $src2, $cond" in {
119+
def rr : PseudoI<(outs t.RegClass:$dst),
120+
(ins t.RegClass:$src1, t.RegClass:$src2, i8imm:$cond),
121+
[(set t.RegClass:$dst, (X86ctselect t.RegClass:$src1, t.RegClass:$src2, timm:$cond, EFLAGS))]>;
122+
}
123+
124+
// register-memory
125+
let SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold], Predicates = [HasNativeCMOV],
126+
AsmString = "ctselect\\t$dst, $src1, $src2, $cond" in {
127+
def rm : PseudoI<(outs t.RegClass:$dst),
128+
(ins t.RegClass:$src1, t.MemOperand:$src2, i8imm:$cond),
129+
[(set t.RegClass:$dst, (X86ctselect t.RegClass:$src1, (t.LoadNode addr:$src2), timm:$cond, EFLAGS))]>;
130+
}
131+
}
132+
}
133+
134+
let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
135+
let Constraints = "$dst = $src1" in {
136+
defm CTSELECT16 : CTSELECT<Xi16>;
137+
defm CTSELECT32 : CTSELECT<Xi32>;
138+
defm CTSELECT64 : CTSELECT<Xi64>;
139+
}
140+
}
141+
142+
// CTSELECT_VEC base class
143+
class CTSELECT_VEC<RegisterClass VRc, RegisterClass GRc>
144+
: PseudoI<
145+
(outs VRc:$dst, VRc:$tmpx, GRc:$tmpg),
146+
(ins VRc:$t, VRc:$f, i8imm:$cond),
147+
[]
148+
> {
149+
let Uses = [EFLAGS];
150+
let isPseudo = 1;
151+
let isNotDuplicable = 1;
152+
let hasSideEffects = 1;
153+
let AsmString = "ctselect\t$dst, $f, $t, $cond";
154+
let SchedRW = [];
155+
}
156+
157+
// Width-specific class aliases
158+
class CTSELECT_VEC128 : CTSELECT_VEC<VR128, GR32>;
159+
class CTSELECT_VEC128X : CTSELECT_VEC<VR128X, GR32>;
160+
class CTSELECT_VEC256 : CTSELECT_VEC<VR256, GR32>;
161+
class CTSELECT_VEC512 : CTSELECT_VEC<VR512, GR32>;
162+
163+
164+
//===----------------------------------------------------------------------===//
165+
// 128-bit pseudos (SSE2 baseline; we use PXOR/PAND/MOVD/PSHUFD in the expander)
166+
//===----------------------------------------------------------------------===//
167+
168+
let Predicates = [HasSSE1] in {
169+
170+
def CTSELECT_V4F32 : CTSELECT_VEC128 {
171+
let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
172+
}
173+
}
174+
175+
let Predicates = [HasSSE2] in {
176+
177+
def CTSELECT_V2F64 : CTSELECT_VEC128 {
178+
let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
179+
}
180+
def CTSELECT_V4I32 : CTSELECT_VEC128 {
181+
let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
182+
}
183+
def CTSELECT_V2I64 : CTSELECT_VEC128 {
184+
let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
185+
}
186+
def CTSELECT_V8I16 : CTSELECT_VEC128 {
187+
let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
188+
}
189+
def CTSELECT_V16I8 : CTSELECT_VEC128 {
190+
let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
191+
}
192+
193+
// If your build has v8f16, keep this; otherwise comment it out.
194+
def CTSELECT_V8F16 : CTSELECT_VEC128 {
195+
let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
196+
}
197+
}
198+
199+
let Predicates = [HasAVX] in {
200+
201+
def CTSELECT_V4F32X : CTSELECT_VEC128X {
202+
let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
203+
}
204+
def CTSELECT_V2F64X : CTSELECT_VEC128X {
205+
let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
206+
}
207+
def CTSELECT_V4I32X : CTSELECT_VEC128X {
208+
let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
209+
}
210+
def CTSELECT_V2I64X : CTSELECT_VEC128X {
211+
let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
212+
}
213+
def CTSELECT_V8I16X : CTSELECT_VEC128X {
214+
let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
215+
}
216+
def CTSELECT_V16I8X : CTSELECT_VEC128X {
217+
let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
218+
}
219+
220+
// If your build has v8f16, keep this; otherwise comment it out.
221+
def CTSELECT_V8F16X : CTSELECT_VEC128X {
222+
let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
223+
}
224+
}
225+
226+
//===----------------------------------------------------------------------===//
227+
// 256-bit pseudos
228+
//===----------------------------------------------------------------------===//
229+
let Predicates = [HasAVX] in {
230+
231+
def CTSELECT_V8F32 : CTSELECT_VEC256 {
232+
let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
233+
}
234+
def CTSELECT_V4F64 : CTSELECT_VEC256 {
235+
let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
236+
}
237+
def CTSELECT_V8I32 : CTSELECT_VEC256 {
238+
let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
239+
}
240+
def CTSELECT_V4I64 : CTSELECT_VEC256 {
241+
let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
242+
}
243+
def CTSELECT_V16I16 : CTSELECT_VEC256 {
244+
let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
245+
}
246+
def CTSELECT_V32I8 : CTSELECT_VEC256 {
247+
let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
248+
}
249+
250+
// If your build has v16f16, keep this; otherwise comment it out.
251+
def CTSELECT_V16F16 : CTSELECT_VEC256 {
252+
let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
253+
}
254+
}
255+
256+
//===----------------------------------------------------------------------===//
257+
// Selection patterns: X86ctselect(...), EFLAGS -> CTSELECT_V*
258+
//
259+
// NOTE:
260+
// * The SDNode carries Glue from CMP/TEST (due to SDNPInGlue).
261+
// * We list EFLAGS explicitly in the pattern (X86 style) to model the arch read.
262+
// * Temps (tmpx, tmpg) are not in the pattern; they're outs allocated by RA.
263+
//===----------------------------------------------------------------------===//
264+
265+
let Predicates = [HasSSE1] in {
266+
267+
// 128-bit float (bitwise-equivalent ops in expander)
268+
def : Pat<(v4f32 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
269+
(CTSELECT_V4F32 VR128:$t, VR128:$f, timm:$cc)>;
270+
}
271+
272+
let Predicates = [HasSSE2] in {
273+
274+
// 128-bit integer
275+
def : Pat<(v4i32 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
276+
(CTSELECT_V4I32 VR128:$t, VR128:$f, timm:$cc)>;
277+
def : Pat<(v2i64 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
278+
(CTSELECT_V2I64 VR128:$t, VR128:$f, timm:$cc)>;
279+
def : Pat<(v8i16 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
280+
(CTSELECT_V8I16 VR128:$t, VR128:$f, timm:$cc)>;
281+
def : Pat<(v16i8 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
282+
(CTSELECT_V16I8 VR128:$t, VR128:$f, timm:$cc)>;
283+
def : Pat<(v2f64 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
284+
(CTSELECT_V2F64 VR128:$t, VR128:$f, timm:$cc)>;
285+
286+
// 128-bit f16 (optional)
287+
def : Pat<(v8f16 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
288+
(CTSELECT_V8F16 VR128:$t, VR128:$f, timm:$cc)>;
289+
}
290+
291+
let Predicates = [HasAVX] in {
292+
293+
// 256-bit integer
294+
def : Pat<(v8i32 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
295+
(CTSELECT_V8I32 VR256:$t, VR256:$f, timm:$cc)>;
296+
def : Pat<(v4i64 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
297+
(CTSELECT_V4I64 VR256:$t, VR256:$f, timm:$cc)>;
298+
def : Pat<(v16i16 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
299+
(CTSELECT_V16I16 VR256:$t, VR256:$f, timm:$cc)>;
300+
def : Pat<(v32i8 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
301+
(CTSELECT_V32I8 VR256:$t, VR256:$f, timm:$cc)>;
302+
303+
// 256-bit float (bitwise-equivalent ops in expander)
304+
def : Pat<(v8f32 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
305+
(CTSELECT_V8F32 VR256:$t, VR256:$f, timm:$cc)>;
306+
def : Pat<(v4f64 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
307+
(CTSELECT_V4F64 VR256:$t, VR256:$f, timm:$cc)>;
308+
309+
// 256-bit f16 (optional)
310+
def : Pat<(v16f16 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
311+
(CTSELECT_V16F16 VR256:$t, VR256:$f, timm:$cc)>;
312+
}
313+
109314
let Predicates = [HasCMOV, HasCF] in {
110315
def : Pat<(X86cmov GR16:$src1, 0, timm:$cond, EFLAGS),
111316
(CFCMOV16rr GR16:$src1, (inv_cond_XFORM timm:$cond))>;

llvm/lib/Target/X86/X86InstrCompiler.td

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -693,6 +693,87 @@ def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
693693
def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
694694
(CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
695695

696+
// CTSELECT
697+
// Enhanced CTSELECT pseudos for i386 with temporary register allocation
698+
// These use a two-phase approach:
699+
// 1. Custom inserter materializes condition byte from EFLAGS
700+
// 2. Post-RA expansion generates constant-time instruction bundles
701+
702+
let isPseudo = 1, isNotDuplicable = 1 in {
703+
// Phase 1: Initial pseudos that consume EFLAGS (via custom inserter)
704+
// These are matched by patterns and convert EFLAGS to condition byte
705+
class CTSELECT_I386_INITIAL<RegisterClass RC, ValueType VT>
706+
: PseudoI<(outs RC:$dst),
707+
(ins RC:$src1, RC:$src2, i8imm:$cond),
708+
[(set RC:$dst, (VT(X86ctselect RC:$src1, RC:$src2, timm:$cond,
709+
EFLAGS)))]> {
710+
let Uses = [EFLAGS];
711+
let Defs = [EFLAGS];
712+
let usesCustomInserter = 1;
713+
let hasNoSchedulingInfo = 1;
714+
}
715+
716+
// Phase 2: Internal pseudos with pre-materialized condition byte (post-RA expansion)
717+
// These generate the actual constant-time instruction bundles
718+
class CTSELECT_I386_INTERNAL<RegisterClass RC, RegisterClass ByteRC>
719+
: PseudoI<(outs RC:$dst, ByteRC:$tmp_byte, RC:$tmp_mask),
720+
(ins RC:$src1, RC:$src2, ByteRC:$cond_byte), []> {
721+
let hasNoSchedulingInfo = 1;
722+
let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_byte,@earlyclobber $tmp_mask";
723+
let Defs = [EFLAGS]; // NEG instruction in post-RA expansion clobbers EFLAGS
724+
}
725+
}
726+
727+
// Phase 1 pseudos for non-CMOV targets (custom inserter materializes condition)
728+
let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
729+
let Predicates = [NoNativeCMOV] in {
730+
def CTSELECT_I386_GR8rr : CTSELECT_I386_INITIAL<GR8, i8>;
731+
def CTSELECT_I386_GR16rr : CTSELECT_I386_INITIAL<GR16, i16>;
732+
def CTSELECT_I386_GR32rr : CTSELECT_I386_INITIAL<GR32, i32>;
733+
}
734+
}
735+
736+
// Phase 2 pseudos (post-RA expansion with pre-materialized condition byte)
737+
let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
738+
let Predicates = [NoNativeCMOV] in {
739+
def CTSELECT_I386_INT_GR8rr :
740+
CTSELECT_I386_INTERNAL<GR8, GR8>;
741+
def CTSELECT_I386_INT_GR16rr :
742+
CTSELECT_I386_INTERNAL<GR16, GR8>;
743+
def CTSELECT_I386_INT_GR32rr :
744+
CTSELECT_I386_INTERNAL<GR32, GR8>;
745+
}
746+
}
747+
748+
let hasSideEffects = 1,
749+
ForceDisassemble = 1,
750+
Constraints = "$dst = $src1" in {
751+
752+
let Predicates = [FPStackf32] in
753+
def CTSELECT_I386_FP32rr : CTSELECT_I386_INITIAL<RFP32, f32>;
754+
755+
let Predicates = [FPStackf64] in
756+
def CTSELECT_I386_FP64rr : CTSELECT_I386_INITIAL<RFP64, f64>;
757+
758+
def CTSELECT_I386_FP80rr : CTSELECT_I386_INITIAL<RFP80, f80>;
759+
}
760+
761+
// Pattern matching for non-native-CMOV CTSELECT (routes to custom inserter for condition materialization)
762+
// NoNativeCMOV ensures these patterns are used when an actual CMOV instruction is not available,
763+
// even if canUseCMOV() is true (e.g., i386 with SSE, which can emulate CMOV)
764+
let Predicates = [NoNativeCMOV] in {
765+
def : Pat<(i8(X86ctselect GR8:$src1, GR8:$src2, timm:$cond, EFLAGS)),
766+
(CTSELECT_I386_GR8rr GR8:$src1, GR8:$src2, timm:$cond)>;
767+
768+
def : Pat<(i16(X86ctselect GR16:$src1, GR16:$src2, timm:$cond, EFLAGS)),
769+
(CTSELECT_I386_GR16rr GR16:$src1, GR16:$src2, timm:$cond)>;
770+
771+
def : Pat<(i32(X86ctselect GR32:$src1, GR32:$src2, timm:$cond, EFLAGS)),
772+
(CTSELECT_I386_GR32rr GR32:$src1, GR32:$src2, timm:$cond)>;
773+
774+
// i64 patterns handled automatically by type legalization
775+
}
776+
696777
//===----------------------------------------------------------------------===//
697778
// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
698779
//===----------------------------------------------------------------------===//

llvm/lib/Target/X86/X86InstrFragments.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,10 @@ def SDTX86Cmov : SDTypeProfile<1, 4,
2828
[SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
2929
SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
3030

31+
def SDTX86CtSelect : SDTypeProfile<1, 4,
32+
[SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
33+
SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
34+
3135
// Unary and binary operator instructions that set EFLAGS as a side-effect.
3236
def SDTUnaryArithWithFlags : SDTypeProfile<2, 1,
3337
[SDTCisSameAs<0, 2>,
@@ -151,6 +155,7 @@ def X86ctest : SDNode<"X86ISD::CTEST", SDTX86Ccmp>;
151155
def X86cload : SDNode<"X86ISD::CLOAD", SDTX86Cload, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
152156
def X86cstore : SDNode<"X86ISD::CSTORE", SDTX86Cstore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
153157

158+
def X86ctselect: SDNode<"X86ISD::CTSELECT", SDTX86CtSelect, [SDNPInGlue]>;
154159
def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>;
155160
def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond,
156161
[SDNPHasChain]>;

0 commit comments

Comments
 (0)