From cee41562976955a1e4c7b911a304b989a73be16d Mon Sep 17 00:00:00 2001 From: wizardengineer Date: Wed, 5 Nov 2025 17:09:23 -0500 Subject: [PATCH 1/2] [LLVM][X86] Add native ct.select support for X86 and i386 Add native X86 implementation with CMOV instructions and comprehensive tests: - X86 ISelLowering with CMOV for x86_64 and i386 - Fallback bitwise operations for i386 targets without CMOV - Post-RA expansion for pseudo-instructions - Comprehensive test coverage: - Edge cases (zero conditions, large integers) - i386-specific tests (FP, MMX, non-CMOV fallback) - Vector operations - Optimization patterns The basic test demonstrating fallback is in the core infrastructure PR. --- llvm/lib/Target/X86/X86.td | 8 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 791 +++++++++- llvm/lib/Target/X86/X86ISelLowering.h | 7 + llvm/lib/Target/X86/X86InstrCMovSetCC.td | 205 +++ llvm/lib/Target/X86/X86InstrCompiler.td | 81 ++ llvm/lib/Target/X86/X86InstrFragments.td | 5 + llvm/lib/Target/X86/X86InstrInfo.cpp | 609 +++++++- llvm/lib/Target/X86/X86InstrInfo.h | 6 + llvm/lib/Target/X86/X86InstrPredicates.td | 5 + llvm/lib/Target/X86/X86TargetMachine.cpp | 5 +- llvm/test/CodeGen/X86/ctselect-edge-cases.ll | 409 ++++++ llvm/test/CodeGen/X86/ctselect-i386-fp.ll | 722 ++++++++++ llvm/test/CodeGen/X86/ctselect-i386-mmx.ll | 428 ++++++ llvm/test/CodeGen/X86/ctselect-i386.ll | 267 ++++ .../test/CodeGen/X86/ctselect-optimization.ll | 304 ++++ llvm/test/CodeGen/X86/ctselect-vector.ll | 1274 +++++++++++++++++ llvm/test/CodeGen/X86/ctselect.ll | 996 +++++++------ 17 files changed, 5671 insertions(+), 451 deletions(-) create mode 100644 llvm/test/CodeGen/X86/ctselect-edge-cases.ll create mode 100644 llvm/test/CodeGen/X86/ctselect-i386-fp.ll create mode 100644 llvm/test/CodeGen/X86/ctselect-i386-mmx.ll create mode 100644 llvm/test/CodeGen/X86/ctselect-i386.ll create mode 100644 llvm/test/CodeGen/X86/ctselect-optimization.ll create mode 100644 llvm/test/CodeGen/X86/ctselect-vector.ll diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 9e291a6ae431f..21826d8289bb9 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -825,9 +825,10 @@ include "X86SchedSapphireRapids.td" def ProcessorFeatures { // x86-64 micro-architecture levels: x86-64 and x86-64-v[234] - list X86_64V1Features = [ - FeatureX87, FeatureCX8, FeatureCMOV, FeatureMMX, FeatureSSE2, - FeatureFXSR, FeatureNOPL, FeatureX86_64, + list X86_64V1Features = [FeatureX87, FeatureCX8, + FeatureCMOV, FeatureMMX, + FeatureSSE2, FeatureFXSR, + FeatureNOPL, FeatureX86_64, ]; list X86_64V1Tuning = [ TuningMacroFusion, @@ -1161,6 +1162,7 @@ def ProcessorFeatures { FeatureAVXNECONVERT, FeatureAVXVNNIINT8, FeatureAVXVNNIINT16, + FeatureUSERMSR, FeatureSHA512, FeatureSM3, FeatureEGPR, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 6edf0185df813..833afa717c32c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "X86ISelLowering.h" +#include "MCTargetDesc/X86MCTargetDesc.h" #include "MCTargetDesc/X86ShuffleDecode.h" #include "X86.h" #include "X86FrameLowering.h" @@ -29,6 +30,8 @@ #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include 
"llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -48,6 +51,7 @@ #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" @@ -488,6 +492,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // X86 wants to expand cmov itself. for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) { setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); @@ -496,11 +501,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); } // Custom action for SELECT MMX and expand action for SELECT_CC MMX setOperationAction(ISD::SELECT, MVT::x86mmx, Custom); + setOperationAction(ISD::CTSELECT, MVT::x86mmx, Custom); setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand); setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); @@ -630,6 +637,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BR_CC, VT, Action); setOperationAction(ISD::SETCC, VT, Action); setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Action); setOperationAction(ISD::FROUND, VT, Action); setOperationAction(ISD::FROUNDEVEN, VT, Action); @@ -1067,6 +1075,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); + setOperationAction(ISD::CTSELECT, MVT::v4f32, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::v4f32, Custom); setOperationAction(ISD::LOAD, MVT::v2f32, Custom); @@ -1220,6 +1229,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::v8f16, Custom); setOperationAction(ISD::SELECT, MVT::v16i8, Custom); + setOperationAction(ISD::CTSELECT, MVT::v2f64, Custom); + setOperationAction(ISD::CTSELECT, MVT::v2i64, Custom); + setOperationAction(ISD::CTSELECT, MVT::v4i32, Custom); + setOperationAction(ISD::CTSELECT, MVT::v8i16, Custom); + setOperationAction(ISD::CTSELECT, MVT::v8f16, Custom); + setOperationAction(ISD::CTSELECT, MVT::v16i8, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); @@ -1541,6 +1557,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::v32i8, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); + setOperationAction(ISD::CTSELECT, MVT::v4f64, Custom); + setOperationAction(ISD::CTSELECT, MVT::v4i64, Custom); + setOperationAction(ISD::CTSELECT, MVT::v8i32, Custom); + setOperationAction(ISD::CTSELECT, MVT::v16i16, Custom); + setOperationAction(ISD::CTSELECT, MVT::v16f16, Custom); + setOperationAction(ISD::CTSELECT, MVT::v32i8, Custom); + setOperationAction(ISD::CTSELECT, MVT::v8f32, Custom); + for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { 
setOperationAction(ISD::SIGN_EXTEND, VT, Custom); setOperationAction(ISD::ZERO_EXTEND, VT, Custom); @@ -1727,6 +1751,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v16i1, &X86::VK16RegClass); setOperationAction(ISD::SELECT, MVT::v1i1, Custom); + setOperationAction(ISD::CTSELECT, MVT::v1i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); @@ -1772,6 +1797,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); @@ -2038,6 +2064,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); @@ -2203,6 +2230,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); @@ -2269,6 +2297,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VSELECT, VT, Legal); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); @@ -2538,6 +2567,22 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::x86amx, &X86::TILERegClass); } + // Handle 512-bit vector CTSELECT without AVX512 by setting them to Expand + // This allows type legalization to split them into smaller vectors + for (auto VT : {MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64, MVT::v32f16, + MVT::v16f32, MVT::v8f64}) { + setOperationAction(ISD::CTSELECT, VT, Expand); + } + + // Handle 256-bit vector CTSELECT without AVX by setting them to Expand + // This allows type legalization to split them into 128-bit vectors + if (!Subtarget.hasAVX()) { + for (auto VT : {MVT::v4f64, MVT::v4i64, MVT::v8i32, MVT::v16i16, + MVT::v16f16, MVT::v32i8, MVT::v8f32}) { + setOperationAction(ISD::CTSELECT, VT, Expand); + } + } + // We want to custom lower some of our intrinsics. 
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); @@ -2644,6 +2689,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, ISD::BITCAST, ISD::VSELECT, ISD::SELECT, + ISD::CTSELECT, ISD::SHL, ISD::SRA, ISD::SRL, @@ -25325,6 +25371,174 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl, return V; } +SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const { + SDValue Cond = Op.getOperand(0); // condition + SDValue TrueOp = Op.getOperand(1); // true_value + SDValue FalseOp = Op.getOperand(2); // false_value + SDLoc DL(Op); + MVT VT = TrueOp.getSimpleValueType(); + + // Special handling for i386 targets (no CMOV) - route to post-RA expansion + // pseudos Let standard type legalization handle i64 automatically (splits + // into EDX:EAX) + + // Handle soft float16 by converting to integer operations + if (isSoftF16(VT, Subtarget)) { + MVT NVT = VT.changeTypeToInteger(); + SDValue CtSelect = + DAG.getNode(ISD::CTSELECT, DL, NVT, Cond, DAG.getBitcast(NVT, FalseOp), + DAG.getBitcast(NVT, TrueOp)); + return DAG.getBitcast(VT, CtSelect); + } + + // Handle vector types + if (VT.isVector()) { + // Handle soft float16 vectors + if (isSoftF16(VT, Subtarget)) { + MVT NVT = VT.changeVectorElementTypeToInteger(); + SDValue CtSelect = DAG.getNode(ISD::CTSELECT, DL, NVT, Cond, + DAG.getBitcast(NVT, FalseOp), + DAG.getBitcast(NVT, TrueOp)); + return DAG.getBitcast(VT, CtSelect); + } + + unsigned VectorWidth = VT.getSizeInBits(); + MVT EltVT = VT.getVectorElementType(); + + // 512-bit vectors without AVX512 are now handled by type legalization + // (Expand action) 256-bit vectors without AVX are now handled by type + // legalization (Expand action) + + if (VectorWidth == 128 && !Subtarget.hasSSE1()) + return SDValue(); + + // Handle special cases for floating point vectors + if (EltVT.isFloatingPoint()) { + // For vector floating point with AVX, use VBLENDV-style operations + if (Subtarget.hasAVX() && (VectorWidth == 256 || VectorWidth == 128)) { + // Convert to bitwise operations using the condition + MVT IntVT = VT.changeVectorElementTypeToInteger(); + SDValue IntOp1 = DAG.getBitcast(IntVT, TrueOp); + SDValue IntOp2 = DAG.getBitcast(IntVT, FalseOp); + + // Create the CTSELECT node with integer types + SDValue IntResult = + DAG.getNode(X86ISD::CTSELECT, DL, IntVT, IntOp2, IntOp1, + DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), + EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget)); + return DAG.getBitcast(VT, IntResult); + } + } + + // For integer vectors or when we don't have advanced SIMD support, + // use the generic X86 CTSELECT node which will be matched by the patterns + SDValue CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8); + SDValue EFLAGS = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget); + // Create the X86 CTSELECT node - note operand order: true, false, cc, flags + return DAG.getNode(X86ISD::CTSELECT, DL, VT, FalseOp, TrueOp, CC, EFLAGS); + } + + // Look past (and (setcc_carry (cmp ...)), 1) + if (Cond.getOpcode() == ISD::AND && + Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && + isOneConstant(Cond.getOperand(1))) + Cond = Cond.getOperand(0); + + /// Process condition flags and prepare for CTSELECT node creation + auto ProcessConditionFlags = + [&](SDValue Cond, MVT VT, SDLoc DL, SelectionDAG &DAG, + const X86Subtarget &Subtarget) -> std::pair { + SDValue CC; + bool AddTest = true; + + unsigned CondOpcode = Cond.getOpcode(); + if (CondOpcode == 
X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) { + CC = Cond.getOperand(0); + SDValue Cmp = Cond.getOperand(1); + + if ((isX86LogicalCmp(Cmp)) || Cmp.getOpcode() == X86ISD::BT) { + Cond = Cmp; + AddTest = false; + } + } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || + CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || + CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) { + SDValue Value; + X86::CondCode X86Cond; + std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG); + CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8); + AddTest = false; + } + + if (AddTest) { + // Look past the truncate if the high bits are known zero + if (isTruncWithZeroHighBitsInput(Cond, DAG)) + Cond = Cond.getOperand(0); + + // Try to match AND to BT instruction + if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { + X86::CondCode X86CondCode; + if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) { + CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8); + Cond = BT; + AddTest = false; + } + } + } + + if (AddTest) { + CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8); + Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget); + } + + return {CC, Cond}; + }; + + // Process condition flags and prepare for CTSELECT + auto [CC, ProcessedCond] = + ProcessConditionFlags(Cond, VT, DL, DAG, Subtarget); + + // Handle i8 CTSELECT with truncate optimization + if (Op.getValueType() == MVT::i8 && TrueOp.getOpcode() == ISD::TRUNCATE && + FalseOp.getOpcode() == ISD::TRUNCATE) { + SDValue T1 = TrueOp.getOperand(0), T2 = FalseOp.getOperand(0); + if (T1.getValueType() == T2.getValueType() && + T1.getOpcode() != ISD::CopyFromReg && + T2.getOpcode() != ISD::CopyFromReg) { + SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, T1.getValueType(), + T2, T1, CC, ProcessedCond); + return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), CtSelect); + } + } + + // Promote small integer types to avoid partial register stalls + // Exception: For i8 without CMOV, we can generate a shorter instruction + // sequence without movzx so keep it as is. + if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMOV()) || + (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(TrueOp, Subtarget) && + !X86::mayFoldLoad(FalseOp, Subtarget))) { + TrueOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, TrueOp); + FalseOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, FalseOp); + SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond}; + SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Ops); + return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), CtSelect); + } + + if (isScalarFPTypeInSSEReg(VT)) { + MVT IntVT = (VT == MVT::f32) ? 
MVT::i32 : MVT::i64; + TrueOp = DAG.getBitcast(IntVT, TrueOp); + FalseOp = DAG.getBitcast(IntVT, FalseOp); + SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond}; + SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, IntVT, Ops); + return DAG.getBitcast(VT, CtSelect); + } + + // Create final CTSELECT node + SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond}; + return DAG.getNode(X86ISD::CTSELECT, DL, Op.getValueType(), Ops, + Op->getFlags()); +} + static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = Op->getOperand(0); @@ -29695,30 +29909,65 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low = nullptr) { + unsigned NumElts = VT.getVectorNumElements(); + // For vXi8 we will unpack the low and high half of each 128 bit lane to widen // to a vXi16 type. Do the multiplies, shift the results and pack the half // lane results back together. // We'll take different approaches for signed and unsigned. - // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes to - // words and use pmullw to calculate the full 16-bit product. + // For unsigned we'll use punpcklbw/punpckhbw to put zero extend the bytes + // and use pmullw to calculate the full 16-bit product. // For signed we'll use punpcklbw/punpckbw to extend the bytes to words and // shift them left into the upper byte of each word. This allows us to use // pmulhw to calculate the full 16-bit product. This trick means we don't // need to sign extend the bytes to use pmullw. - MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); + + MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2); SDValue Zero = DAG.getConstant(0, dl, VT); - SDValue ALo, AHi, BLo, BHi; + SDValue ALo, AHi; if (IsSigned) { ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A)); - BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B)); AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A)); - BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B)); } else { ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero)); - BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero)); AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero)); + } + + SDValue BLo, BHi; + if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) { + // If the RHS is a constant, manually unpackl/unpackh and extend. 
+ SmallVector LoOps, HiOps; + for (unsigned i = 0; i != NumElts; i += 16) { + for (unsigned j = 0; j != 8; ++j) { + SDValue LoOp = B.getOperand(i + j); + SDValue HiOp = B.getOperand(i + j + 8); + + if (IsSigned) { + LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16); + HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16); + LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp, + DAG.getConstant(8, dl, MVT::i16)); + HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp, + DAG.getConstant(8, dl, MVT::i16)); + } else { + LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16); + HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16); + } + + LoOps.push_back(LoOp); + HiOps.push_back(HiOp); + } + } + + BLo = DAG.getBuildVector(ExVT, dl, LoOps); + BHi = DAG.getBuildVector(ExVT, dl, HiOps); + } else if (IsSigned) { + BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B)); + BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B)); + } else { + BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero)); BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero)); } @@ -29731,7 +29980,7 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, if (Low) *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi); - return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf=*/true); + return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true); } static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, @@ -33594,6 +33843,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG); case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); + case ISD::CTSELECT: return LowerCTSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); @@ -33677,6 +33927,12 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { } } +bool X86TargetLowering::isSelectSupported(SelectSupportKind Kind) const { + if (Kind == SelectSupportKind::CtSelect) { + return true; + } + return TargetLoweringBase::isSelectSupported(Kind); +} /// Replace a node with an illegal result type with a new node built out of /// custom code. void X86TargetLowering::ReplaceNodeResults(SDNode *N, @@ -34904,6 +35160,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(STRICT_CMPM) NODE_NAME_CASE(CMPMM_SAE) NODE_NAME_CASE(SETCC) + NODE_NAME_CASE(CTSELECT) NODE_NAME_CASE(SETCC_CARRY) NODE_NAME_CASE(FSETCC) NODE_NAME_CASE(FSETCCM) @@ -37677,6 +37934,480 @@ X86TargetLowering::emitPatchableEventCall(MachineInstr &MI, return BB; } +/// Helper function to emit i386 CTSELECT with condition materialization. +/// This converts EFLAGS-based CTSELECT into a condition byte that can be +/// shared across multiple operations (critical for i64 type legalization). +/// +/// Phase 1: Materialize condition byte from EFLAGS using SETCC +/// Phase 2: Create internal pseudo with condition byte for post-RA expansion +/// +/// This approach ensures that when i64 is type-legalized into two i32 +/// operations, both operations share the same condition byte rather than +/// each independently reading (and destroying) EFLAGS. 
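+///
+/// Illustrative shape of the resulting MIR (register names below are
+/// hypothetical, not taken from this patch) for an i64 ctselect that type
+/// legalization split into two i32 halves, both reusing the single
+/// materialized condition byte:
+///
+///   %cond            = SETCCr <inverted cc>   ; reads EFLAGS exactly once
+///   %lo, %tb0, %tm0  = CTSELECT_I386_INT_GR32rr %f_lo, %t_lo, %cond
+///   %hi, %tb1, %tm1  = CTSELECT_I386_INT_GR32rr %f_hi, %t_hi, %cond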
+static MachineBasicBlock * +emitCTSelectI386WithConditionMaterialization(MachineInstr &MI, + MachineBasicBlock *BB, + unsigned InternalPseudoOpcode) { + const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo(); + const MIMetadata MIMD(MI); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + // Original pseudo operands: (outs dst), (ins src1, src2, cond) + Register Src1Reg = MI.getOperand(1).getReg(); + Register Src2Reg = MI.getOperand(2).getReg(); + X86::CondCode CC = static_cast(MI.getOperand(3).getImm()); + + // Get opposite condition (SETCC sets to 1 when condition is TRUE, + // but we want to select src1 when condition is FALSE for X86 semantics) + X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); + + // Step 1: Materialize condition byte from EFLAGS + // This is done OUTSIDE the constant-time bundle, before any EFLAGS corruption + Register CondByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); + BuildMI(*BB, MI, MIMD, TII->get(X86::SETCCr), CondByteReg).addImm(OppCC); + + // Step 2: Create internal pseudo that takes condition byte as input + // This pseudo will be expanded post-RA into the actual constant-time bundle + // The condition byte can now be safely shared between multiple pseudos + + // Internal pseudo has operands: (outs dst, tmp_byte, tmp_mask), (ins src1, + // src2, cond_byte) + Register DstReg = MI.getOperand(0).getReg(); + + // Create virtual registers for the temporary outputs + Register TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); + Register TmpMaskReg; + + // Determine the register class for tmp_mask based on the data type + if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR8rr) { + TmpMaskReg = MRI.createVirtualRegister(&X86::GR8RegClass); + } else if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR16rr) { + TmpMaskReg = MRI.createVirtualRegister(&X86::GR16RegClass); + } else if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR32rr) { + TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass); + } else { + llvm_unreachable("Unknown internal pseudo opcode"); + } + + BuildMI(*BB, MI, MIMD, TII->get(InternalPseudoOpcode)) + .addDef(DstReg) // dst (output) + .addDef(TmpByteReg) // tmp_byte (output) + .addDef(TmpMaskReg) // tmp_mask (output) + .addReg(Src1Reg) // src1 (input) + .addReg(Src2Reg) // src2 (input) + .addReg(CondByteReg); // pre-materialized condition byte (input) + + MI.eraseFromParent(); + return BB; +} + +// Helper structure to hold memory operand information for FP loads +struct FPLoadMemOperands { + bool IsValid = false; + unsigned BaseReg = 0; + int64_t ScaleVal = 1; + unsigned IndexReg = 0; + int64_t Disp = 0; + unsigned SegReg = 0; + int FrameIndex = -1; + bool IsFrameIndex = false; + int ConstantPoolIndex = -1; + bool IsConstantPool = false; + const GlobalValue *Global = nullptr; + int64_t GlobalOffset = 0; + bool IsGlobal = false; +}; + +// Check if a virtual register is defined by a simple FP load instruction +// Returns the memory operands if it's a simple load, otherwise returns invalid +static FPLoadMemOperands getFPLoadMemOperands(Register Reg, + MachineRegisterInfo &MRI, + unsigned ExpectedLoadOpcode) { + FPLoadMemOperands Result; + + if (!Reg.isVirtual()) + return Result; + + MachineInstr *DefMI = MRI.getVRegDef(Reg); + if (!DefMI) + return Result; + + // Check if it's the expected load opcode (e.g., LD_Fp32m, LD_Fp64m, LD_Fp80m) + if (DefMI->getOpcode() != ExpectedLoadOpcode) + return Result; + + // Check that this is a simple load - not volatile, 
not atomic, etc. + // FP loads have hasSideEffects = 0 in their definition for simple loads + if (DefMI->hasOrderedMemoryRef()) + return Result; + + // The load should have a single def (the destination register) and memory operands + // Format: %reg = LD_Fpxxm , 1, %noreg, 0, %noreg + // or: %reg = LD_Fpxxm %base, scale, %index, disp, %segment + if (DefMI->getNumOperands() < 6) + return Result; + + // Operand 0 is the destination, operands 1-5 are the memory reference + MachineOperand &BaseMO = DefMI->getOperand(1); + MachineOperand &ScaleMO = DefMI->getOperand(2); + MachineOperand &IndexMO = DefMI->getOperand(3); + MachineOperand &DispMO = DefMI->getOperand(4); + MachineOperand &SegMO = DefMI->getOperand(5); + + // Check if this is a frame index load + if (BaseMO.isFI()) { + Result.IsValid = true; + Result.IsFrameIndex = true; + Result.FrameIndex = BaseMO.getIndex(); + Result.ScaleVal = ScaleMO.getImm(); + Result.IndexReg = IndexMO.getReg(); + Result.Disp = DispMO.getImm(); + Result.SegReg = SegMO.getReg(); + return Result; + } + + // Check if this is a constant pool load + // Format: %reg = LD_Fpxxm $noreg, 1, $noreg, %const.N, $noreg + if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister && + ScaleMO.isImm() && IndexMO.isReg() && + IndexMO.getReg() == X86::NoRegister && + DispMO.isCPI() && SegMO.isReg()) { + Result.IsValid = true; + Result.IsConstantPool = true; + Result.ConstantPoolIndex = DispMO.getIndex(); + Result.ScaleVal = ScaleMO.getImm(); + Result.IndexReg = IndexMO.getReg(); + Result.Disp = 0; + Result.SegReg = SegMO.getReg(); + return Result; + } + + // Check if this is a global variable load + // Format: %reg = LD_Fpxxm $noreg, 1, $noreg, @global_name, $noreg + if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister && + ScaleMO.isImm() && IndexMO.isReg() && + IndexMO.getReg() == X86::NoRegister && + DispMO.isGlobal() && SegMO.isReg()) { + Result.IsValid = true; + Result.IsGlobal = true; + Result.Global = DispMO.getGlobal(); + Result.GlobalOffset = DispMO.getOffset(); + Result.ScaleVal = ScaleMO.getImm(); + Result.IndexReg = IndexMO.getReg(); + Result.Disp = 0; + Result.SegReg = SegMO.getReg(); + return Result; + } + + // Regular memory operands (e.g., pointer loads) + if (BaseMO.isReg() && ScaleMO.isImm() && IndexMO.isReg() && + DispMO.isImm() && SegMO.isReg()) { + Result.IsValid = true; + Result.IsFrameIndex = false; + Result.IsConstantPool = false; + Result.BaseReg = BaseMO.getReg(); + Result.ScaleVal = ScaleMO.getImm(); + Result.IndexReg = IndexMO.getReg(); + Result.Disp = DispMO.getImm(); + Result.SegReg = SegMO.getReg(); + return Result; + } + + return Result; +} + +static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI, + MachineBasicBlock *BB, + unsigned pseudoInstr) { + const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo(); + const MIMetadata MIMD(MI); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineFrameInfo &MFI = MF->getFrameInfo(); + unsigned RegSizeInByte = 4; + + // Get operands + // MI operands: %result:rfp80 = CTSELECT_I386 %false:rfp80, %true:rfp80, %cond:i8imm + unsigned DestReg = MI.getOperand(0).getReg(); + unsigned FalseReg = MI.getOperand(1).getReg(); + unsigned TrueReg = MI.getOperand(2).getReg(); + X86::CondCode CC = static_cast(MI.getOperand(3).getImm()); + X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); + + // Materialize condition byte from EFLAGS + Register CondByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); + BuildMI(*BB, MI, MIMD, 
TII->get(X86::SETCCr), CondByteReg).addImm(OppCC); + + auto storeFpToSlot = [&](unsigned Opcode, int Slot, Register Reg) { + addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(Opcode)), Slot) + .addReg(Reg, RegState::Kill); + }; + + // Helper to load integer from memory operands + auto loadIntFromMemOperands = [&](const FPLoadMemOperands &MemOps, + unsigned Offset) -> unsigned { + unsigned IntReg = MRI.createVirtualRegister(&X86::GR32RegClass); + MachineInstrBuilder MIB = + BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), IntReg); + + if (MemOps.IsFrameIndex) { + // Frame index: addFrameIndex + scale + index + disp + segment + MIB.addFrameIndex(MemOps.FrameIndex) + .addImm(MemOps.ScaleVal) + .addReg(MemOps.IndexReg) + .addImm(MemOps.Disp + Offset) + .addReg(MemOps.SegReg); + } else if (MemOps.IsConstantPool) { + // Constant pool: base_reg + scale + index + CP_index + segment + // MOV32rm format: base, scale, index, displacement, segment + MIB.addReg(X86::NoRegister) // Base register + .addImm(MemOps.ScaleVal) // Scale + .addReg(MemOps.IndexReg) // Index register + .addConstantPoolIndex(MemOps.ConstantPoolIndex, Offset) // Displacement (CP index) + .addReg(MemOps.SegReg); // Segment + } else if (MemOps.IsGlobal) { + // Global variable: base_reg + scale + index + global + segment + // MOV32rm format: base, scale, index, displacement, segment + MIB.addReg(X86::NoRegister) // Base register + .addImm(MemOps.ScaleVal) // Scale + .addReg(MemOps.IndexReg) // Index register + .addGlobalAddress(MemOps.Global, MemOps.GlobalOffset + Offset) // Displacement (global address) + .addReg(MemOps.SegReg); // Segment + } else { + // Regular memory: base_reg + scale + index + disp + segment + MIB.addReg(MemOps.BaseReg) + .addImm(MemOps.ScaleVal) + .addReg(MemOps.IndexReg) + .addImm(MemOps.Disp + Offset) + .addReg(MemOps.SegReg); + } + + return IntReg; + }; + + // Optimized path: load integers directly from memory when both operands are + // memory loads, avoiding FP register round-trip + auto emitCtSelectFromMemory = [&](unsigned NumValues, + const FPLoadMemOperands &TrueMemOps, + const FPLoadMemOperands &FalseMemOps, + int ResultSlot) { + for (unsigned Val = 0; Val < NumValues; ++Val) { + unsigned Offset = Val * RegSizeInByte; + + // Load true and false values directly from their memory locations as integers + unsigned TrueIntReg = loadIntFromMemOperands(TrueMemOps, Offset); + unsigned FalseIntReg = loadIntFromMemOperands(FalseMemOps, Offset); + + // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time selection + unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass); + unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); + unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass); + + BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr)) + .addDef(ResultIntReg) // dst (output) + .addDef(TmpByteReg) // tmp_byte (output) + .addDef(TmpMaskReg) // tmp_mask (output) + .addReg(FalseIntReg) // src1 (input) - false value + .addReg(TrueIntReg) // src2 (input) - true value + .addReg(CondByteReg); // pre-materialized condition byte (input) + + // Store result back to result slot + BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr)) + .addFrameIndex(ResultSlot) + .addImm(1) + .addReg(0) + .addImm(Offset) + .addReg(0) + .addReg(ResultIntReg, RegState::Kill); + } + }; + + auto emitCtSelectWithPseudo = [&](unsigned NumValues, int TrueSlot, int FalseSlot, int ResultSlot) { + for (unsigned Val = 0; Val < NumValues; ++Val) { + unsigned Offset = Val * RegSizeInByte; + + // 
Load true and false values from stack as 32-bit integers + unsigned TrueIntReg = MRI.createVirtualRegister(&X86::GR32RegClass); + BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), TrueIntReg) + .addFrameIndex(TrueSlot) + .addImm(1) + .addReg(0) + .addImm(Offset) + .addReg(0); + + unsigned FalseIntReg = MRI.createVirtualRegister(&X86::GR32RegClass); + BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), FalseIntReg) + .addFrameIndex(FalseSlot) + .addImm(1) + .addReg(0) + .addImm(Offset) + .addReg(0); + + // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time selection + unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass); + unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); + unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass); + + BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr)) + .addDef(ResultIntReg) // dst (output) + .addDef(TmpByteReg) // tmp_byte (output) + .addDef(TmpMaskReg) // tmp_mask (output) + .addReg(FalseIntReg) // src1 (input) - false value + .addReg(TrueIntReg) // src2 (input) - true value + .addReg(CondByteReg); // pre-materialized condition byte (input) + + // Store result back to result slot + BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr)) + .addFrameIndex(ResultSlot) + .addImm(1) + .addReg(0) + .addImm(Offset) + .addReg(0) + .addReg(ResultIntReg, RegState::Kill); + } + }; + + switch (pseudoInstr) { + case X86::CTSELECT_I386_FP32rr: { + // Check if both operands are simple memory loads + FPLoadMemOperands TrueMemOps = + getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp32m); + FPLoadMemOperands FalseMemOps = + getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp32m); + + int ResultSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false); + + if (TrueMemOps.IsValid && FalseMemOps.IsValid) { + // Optimized path: load directly from memory as integers + // Works for both frame index loads (stack parameters) and + // constant pool loads (constants) + emitCtSelectFromMemory(1, TrueMemOps, FalseMemOps, ResultSlot); + + // Erase the original FP load instructions since we're not using them + // and have loaded the data directly as integers instead + if (MRI.hasOneUse(TrueReg)) { + if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg)) + TrueDefMI->eraseFromParent(); + } + if (MRI.hasOneUse(FalseReg)) { + if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg)) + FalseDefMI->eraseFromParent(); + } + } else { + // General path: spill FP registers to stack first + int TrueSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false); + int FalseSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false); + + storeFpToSlot(X86::ST_Fp32m, TrueSlot, TrueReg); + storeFpToSlot(X86::ST_Fp32m, FalseSlot, FalseReg); + + emitCtSelectWithPseudo(1, TrueSlot, FalseSlot, ResultSlot); + } + + // Load result back as f32 + addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp32m), DestReg), + ResultSlot); + break; + } + case X86::CTSELECT_I386_FP64rr: { + unsigned StackSlotSize = 8; + + // Check if both operands are simple memory loads + FPLoadMemOperands TrueMemOps = + getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp64m); + FPLoadMemOperands FalseMemOps = + getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp64m); + + int ResultSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false); + + if (TrueMemOps.IsValid && FalseMemOps.IsValid) { + // Optimized path: load directly from memory as integers + // Works for both frame index loads (stack parameters) and + // constant pool loads (constants) + emitCtSelectFromMemory(StackSlotSize / 
RegSizeInByte, TrueMemOps, + FalseMemOps, ResultSlot); + + // Erase the original FP load instructions since we're not using them + if (MRI.hasOneUse(TrueReg)) { + if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg)) + TrueDefMI->eraseFromParent(); + } + if (MRI.hasOneUse(FalseReg)) { + if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg)) + FalseDefMI->eraseFromParent(); + } + } else { + // General path: spill FP registers to stack first + int TrueSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false); + int FalseSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false); + + storeFpToSlot(X86::ST_Fp64m, TrueSlot, TrueReg); + storeFpToSlot(X86::ST_Fp64m, FalseSlot, FalseReg); + + emitCtSelectWithPseudo(StackSlotSize / RegSizeInByte, TrueSlot, FalseSlot, + ResultSlot); + } + + // Load result back as f64 + addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp64m), DestReg), + ResultSlot); + break; + } + case X86::CTSELECT_I386_FP80rr: { + // f80 is 80 bits (10 bytes), but stored with 12-byte alignment + unsigned StackObjectSize = 12; + + // Check if both operands are simple memory loads + FPLoadMemOperands TrueMemOps = + getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp80m); + FPLoadMemOperands FalseMemOps = + getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp80m); + + int ResultSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false); + + if (TrueMemOps.IsValid && FalseMemOps.IsValid) { + // Optimized path: load directly from memory as integers + // Works for both frame index loads (stack parameters) and + // constant pool loads (constants) + emitCtSelectFromMemory(StackObjectSize / RegSizeInByte, TrueMemOps, + FalseMemOps, ResultSlot); + + // Erase the original FP load instructions since we're not using them + if (MRI.hasOneUse(TrueReg)) { + if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg)) + TrueDefMI->eraseFromParent(); + } + if (MRI.hasOneUse(FalseReg)) { + if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg)) + FalseDefMI->eraseFromParent(); + } + } else { + // General path: spill FP registers to stack first + int TrueSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false); + int FalseSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false); + + storeFpToSlot(X86::ST_FpP80m, TrueSlot, TrueReg); + storeFpToSlot(X86::ST_FpP80m, FalseSlot, FalseReg); + + emitCtSelectWithPseudo(StackObjectSize / RegSizeInByte, TrueSlot, + FalseSlot, ResultSlot); + } + + // Load result back as f80 + addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp80m), DestReg), + ResultSlot); + break; + } + default: + llvm_unreachable("Invalid CTSELECT opcode"); + } + + MI.eraseFromParent(); + + return BB; +} + MachineBasicBlock * X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { @@ -37734,6 +38465,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::CMOV_VK64: return EmitLoweredSelect(MI, BB); + case X86::CTSELECT_I386_GR8rr: + return emitCTSelectI386WithConditionMaterialization( + MI, BB, X86::CTSELECT_I386_INT_GR8rr); + + case X86::CTSELECT_I386_GR16rr: + return emitCTSelectI386WithConditionMaterialization( + MI, BB, X86::CTSELECT_I386_INT_GR16rr); + + case X86::CTSELECT_I386_GR32rr: + return emitCTSelectI386WithConditionMaterialization( + MI, BB, X86::CTSELECT_I386_INT_GR32rr); + + case X86::CTSELECT_I386_FP32rr: + return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP32rr); + case X86::CTSELECT_I386_FP64rr: + return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP64rr); + case 
X86::CTSELECT_I386_FP80rr: + return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP80rr); + case X86::FP80_ADDr: case X86::FP80_ADDm32: { // Change the floating point control register to use double extended @@ -41695,7 +42445,7 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) || X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget)) return SDValue(); - Imm = llvm::rotl(Imm, 4); + Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4); return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0, DAG.getTargetConstant(Imm, DL, MVT::i8)); }; @@ -44662,16 +45412,10 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( } case X86ISD::PCMPGT: // icmp sgt(0, R) == ashr(R, BitWidth-1). - if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode())) { - // iff we only need the signbit then we can use R directly. - if (OriginalDemandedBits.isSignMask()) - return TLO.CombineTo(Op, Op.getOperand(1)); - // otherwise we just need R's signbit for the comparison. - APInt SignMask = APInt::getSignMask(BitWidth); - if (SimplifyDemandedBits(Op.getOperand(1), SignMask, OriginalDemandedElts, - Known, TLO, Depth + 1)) - return true; - } + // iff we only need the sign bit then we can use R directly. + if (OriginalDemandedBits.isSignMask() && + ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode())) + return TLO.CombineTo(Op, Op.getOperand(1)); break; case X86ISD::MOVMSK: { SDValue Src = Op.getOperand(0); @@ -47581,15 +48325,6 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, DL, DAG, Subtarget)) return V; - // If the sign bit is known then BLENDV can be folded away. - if (N->getOpcode() == X86ISD::BLENDV) { - KnownBits KnownCond = DAG.computeKnownBits(Cond); - if (KnownCond.isNegative()) - return LHS; - if (KnownCond.isNonNegative()) - return RHS; - } - if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) { SmallVector CondMask; if (createShuffleMaskFromVSELECT(CondMask, Cond, diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index b7151f65942b4..d759895719388 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -114,6 +114,10 @@ namespace llvm { /// X86 Select SELECTS, + /// X86 Constant-time Select, implemented with CMOV instruction. This is + /// used to implement constant-time select. + CTSELECT, + // Same as SETCC except it's materialized with a sbb and the value is all // one's or all zero's. SETCC_CARRY, // R = carry_bit ? ~0 : 0 @@ -1139,6 +1143,8 @@ namespace llvm { /// SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + bool isSelectSupported(SelectSupportKind Kind) const override; + /// Replace the results of node with an illegal result /// type with new values built out of custom code. 
/// @@ -1765,6 +1771,7 @@ namespace llvm { SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td index 7d5d7cf4a83ab..9c34889f03354 100644 --- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td +++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td @@ -106,6 +106,211 @@ let Predicates = [HasCMOV, HasNDD] in { def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, timm:$cond, EFLAGS), (CMOV64rm_ND GR64:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>; } + +// Create pseudo instruction and do the pattern matching to them. +// We use a machine pass to lower these pseudos into cmov, in order +// to avoid backend optimizations +let Uses = [EFLAGS], isNotDuplicable = 1, isPseudo = 1 in { + + multiclass CTSELECT { + // register-only + let isCommutable = 0, SchedRW = [WriteCMOV], Predicates = [HasNativeCMOV], + AsmString = "ctselect\\t$dst, $src1, $src2, $cond" in { + def rr : PseudoI<(outs t.RegClass:$dst), + (ins t.RegClass:$src1, t.RegClass:$src2, i8imm:$cond), + [(set t.RegClass:$dst, (X86ctselect t.RegClass:$src1, t.RegClass:$src2, timm:$cond, EFLAGS))]>; + } + + // register-memory + let SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold], Predicates = [HasNativeCMOV], + AsmString = "ctselect\\t$dst, $src1, $src2, $cond" in { + def rm : PseudoI<(outs t.RegClass:$dst), + (ins t.RegClass:$src1, t.MemOperand:$src2, i8imm:$cond), + [(set t.RegClass:$dst, (X86ctselect t.RegClass:$src1, (t.LoadNode addr:$src2), timm:$cond, EFLAGS))]>; + } + } +} + +let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in { + let Constraints = "$dst = $src1" in { + defm CTSELECT16 : CTSELECT; + defm CTSELECT32 : CTSELECT; + defm CTSELECT64 : CTSELECT; + } +} + +// CTSELECT_VEC base class +class CTSELECT_VEC + : PseudoI< + (outs VRc:$dst, VRc:$tmpx, GRc:$tmpg), + (ins VRc:$t, VRc:$f, i8imm:$cond), + [] + > { + let Uses = [EFLAGS]; + let isPseudo = 1; + let isNotDuplicable = 1; + let hasSideEffects = 1; + let AsmString = "ctselect\t$dst, $f, $t, $cond"; + let SchedRW = []; +} + +// Width-specific class aliases +class CTSELECT_VEC128 : CTSELECT_VEC; +class CTSELECT_VEC128X : CTSELECT_VEC; +class CTSELECT_VEC256 : CTSELECT_VEC; +class CTSELECT_VEC512 : CTSELECT_VEC; + + +//===----------------------------------------------------------------------===// +// 128-bit pseudos (SSE2 baseline; we use PXOR/PAND/MOVD/PSHUFD in the expander) +//===----------------------------------------------------------------------===// + +let Predicates = [HasSSE1] in { + + def CTSELECT_V4F32 : CTSELECT_VEC128 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } +} + +let Predicates = [HasSSE2] in { + + def CTSELECT_V2F64 : CTSELECT_VEC128 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V4I32 : CTSELECT_VEC128 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V2I64 : CTSELECT_VEC128 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V8I16 : CTSELECT_VEC128 { + let Constraints = "@earlyclobber 
$dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V16I8 : CTSELECT_VEC128 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + + // If your build has v8f16, keep this; otherwise comment it out. + def CTSELECT_V8F16 : CTSELECT_VEC128 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } +} + +let Predicates = [HasAVX] in { + + def CTSELECT_V4F32X : CTSELECT_VEC128X { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V2F64X : CTSELECT_VEC128X { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V4I32X : CTSELECT_VEC128X { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V2I64X : CTSELECT_VEC128X { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V8I16X : CTSELECT_VEC128X { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V16I8X : CTSELECT_VEC128X { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + + // If your build has v8f16, keep this; otherwise comment it out. + def CTSELECT_V8F16X : CTSELECT_VEC128X { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } +} + +//===----------------------------------------------------------------------===// +// 256-bit pseudos +//===----------------------------------------------------------------------===// +let Predicates = [HasAVX] in { + + def CTSELECT_V8F32 : CTSELECT_VEC256 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V4F64 : CTSELECT_VEC256 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V8I32 : CTSELECT_VEC256 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V4I64 : CTSELECT_VEC256 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V16I16 : CTSELECT_VEC256 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V32I8 : CTSELECT_VEC256 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + + // If your build has v16f16, keep this; otherwise comment it out. + def CTSELECT_V16F16 : CTSELECT_VEC256 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } +} + +//===----------------------------------------------------------------------===// +// Selection patterns: X86ctselect(...), EFLAGS -> CTSELECT_V* +// +// NOTE: +// * The SDNode carries Glue from CMP/TEST (due to SDNPInGlue). +// * We list EFLAGS explicitly in the pattern (X86 style) to model the arch read. +// * Temps (tmpx/tmpy,tmpg) are not in the pattern; they’re outs allocated by RA. 
+//===----------------------------------------------------------------------===// + +let Predicates = [HasSSE1] in { + + // 128-bit float (bitwise-equivalent ops in expander) + def : Pat<(v4f32 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V4F32 VR128:$t, VR128:$f, timm:$cc)>; +} + +let Predicates = [HasSSE2] in { + + // 128-bit integer + def : Pat<(v4i32 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V4I32 VR128:$t, VR128:$f, timm:$cc)>; + def : Pat<(v2i64 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V2I64 VR128:$t, VR128:$f, timm:$cc)>; + def : Pat<(v8i16 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V8I16 VR128:$t, VR128:$f, timm:$cc)>; + def : Pat<(v16i8 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V16I8 VR128:$t, VR128:$f, timm:$cc)>; + def : Pat<(v2f64 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V2F64 VR128:$t, VR128:$f, timm:$cc)>; + + // 128-bit f16 (optional) + def : Pat<(v8f16 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V8F16 VR128:$t, VR128:$f, timm:$cc)>; +} + +let Predicates = [HasAVX] in { + + // 256-bit integer + def : Pat<(v8i32 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V8I32 VR256:$t, VR256:$f, timm:$cc)>; + def : Pat<(v4i64 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V4I64 VR256:$t, VR256:$f, timm:$cc)>; + def : Pat<(v16i16 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V16I16 VR256:$t, VR256:$f, timm:$cc)>; + def : Pat<(v32i8 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V32I8 VR256:$t, VR256:$f, timm:$cc)>; + + // 256-bit float (bitwise-equivalent ops in expander) + def : Pat<(v8f32 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V8F32 VR256:$t, VR256:$f, timm:$cc)>; + def : Pat<(v4f64 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V4F64 VR256:$t, VR256:$f, timm:$cc)>; + + // 256-bit f16 (optional) + def : Pat<(v16f16 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V16F16 VR256:$t, VR256:$f, timm:$cc)>; +} + let Predicates = [HasCMOV, HasCF] in { def : Pat<(X86cmov GR16:$src1, 0, timm:$cond, EFLAGS), (CFCMOV16rr GR16:$src1, (inv_cond_XFORM timm:$cond))>; diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index ec31675731b79..d40c91b52c808 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -693,6 +693,87 @@ def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; +// CTSELECT +// Enhanced CTSELECT pseudos for i386 with temporary register allocation +// These use a two-phase approach: +// 1. Custom inserter materializes condition byte from EFLAGS +// 2. 
Post-RA expansion generates constant-time instruction bundles + +let isPseudo = 1, isNotDuplicable = 1 in { + // Phase 1: Initial pseudos that consume EFLAGS (via custom inserter) + // These are matched by patterns and convert EFLAGS to condition byte + class CTSELECT_I386_INITIAL + : PseudoI<(outs RC:$dst), + (ins RC:$src1, RC:$src2, i8imm:$cond), + [(set RC:$dst, (VT(X86ctselect RC:$src1, RC:$src2, timm:$cond, + EFLAGS)))]> { + let Uses = [EFLAGS]; + let Defs = [EFLAGS]; + let usesCustomInserter = 1; + let hasNoSchedulingInfo = 1; + } + + // Phase 2: Internal pseudos with pre-materialized condition byte (post-RA expansion) + // These generate the actual constant-time instruction bundles + class CTSELECT_I386_INTERNAL + : PseudoI<(outs RC:$dst, ByteRC:$tmp_byte, RC:$tmp_mask), + (ins RC:$src1, RC:$src2, ByteRC:$cond_byte), []> { + let hasNoSchedulingInfo = 1; + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_byte,@earlyclobber $tmp_mask"; + let Defs = [EFLAGS]; // NEG instruction in post-RA expansion clobbers EFLAGS + } +} + +// Phase 1 pseudos for non-CMOV targets (custom inserter materializes condition) +let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in { + let Predicates = [NoNativeCMOV] in { + def CTSELECT_I386_GR8rr : CTSELECT_I386_INITIAL; + def CTSELECT_I386_GR16rr : CTSELECT_I386_INITIAL; + def CTSELECT_I386_GR32rr : CTSELECT_I386_INITIAL; + } +} + +// Phase 2 pseudos (post-RA expansion with pre-materialized condition byte) +let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in { + let Predicates = [NoNativeCMOV] in { + def CTSELECT_I386_INT_GR8rr : + CTSELECT_I386_INTERNAL; + def CTSELECT_I386_INT_GR16rr : + CTSELECT_I386_INTERNAL; + def CTSELECT_I386_INT_GR32rr : + CTSELECT_I386_INTERNAL; + } +} + +let hasSideEffects = 1, + ForceDisassemble = 1, + Constraints = "$dst = $src1" in { + + let Predicates = [FPStackf32] in + def CTSELECT_I386_FP32rr : CTSELECT_I386_INITIAL; + + let Predicates = [FPStackf64] in + def CTSELECT_I386_FP64rr : CTSELECT_I386_INITIAL; + + def CTSELECT_I386_FP80rr : CTSELECT_I386_INITIAL; +} + +// Pattern matching for non-native-CMOV CTSELECT (routes to custom inserter for condition materialization) +// NoNativeCMOV ensures these patterns are used when actual CMOV instruction is not available +// even if canUseCMOV() is true (e.g., i386 with SSE which can emulate CMOV) +let Predicates = [NoNativeCMOV] in { + def : Pat<(i8(X86ctselect GR8:$src1, GR8:$src2, timm:$cond, EFLAGS)), + (CTSELECT_I386_GR8rr GR8:$src1, GR8:$src2, timm:$cond)>; + + def : Pat<(i16(X86ctselect GR16:$src1, GR16:$src2, timm:$cond, EFLAGS)), + (CTSELECT_I386_GR16rr GR16:$src1, GR16:$src2, timm:$cond)>; + + def : Pat<(i32(X86ctselect GR32:$src1, GR32:$src2, timm:$cond, EFLAGS)), + (CTSELECT_I386_GR32rr GR32:$src1, GR32:$src2, timm:$cond)>; + + // i64 patterns handled automatically by type legalization +} + //===----------------------------------------------------------------------===// // Normal-Instructions-With-Lock-Prefix Pseudo Instructions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td index 116986a0fffea..4c9e5bae3b46c 100644 --- a/llvm/lib/Target/X86/X86InstrFragments.td +++ b/llvm/lib/Target/X86/X86InstrFragments.td @@ -28,6 +28,10 @@ def SDTX86Cmov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; +def SDTX86CtSelect : SDTypeProfile<1, 4, + [SDTCisSameAs<0, 1>, 
SDTCisSameAs<1, 2>, + SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; + // Unary and binary operator instructions that set EFLAGS as a side-effect. def SDTUnaryArithWithFlags : SDTypeProfile<2, 1, [SDTCisSameAs<0, 2>, @@ -151,6 +155,7 @@ def X86ctest : SDNode<"X86ISD::CTEST", SDTX86Ccmp>; def X86cload : SDNode<"X86ISD::CLOAD", SDTX86Cload, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def X86cstore : SDNode<"X86ISD::CSTORE", SDTX86Cstore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86ctselect: SDNode<"X86ISD::CTSELECT", SDTX86CtSelect, [SDNPInGlue]>; def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>; def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond, [SDNPHasChain]>; diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 6b2a7a4ec3583..765db86ffafb3 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -475,6 +475,556 @@ bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op, return false; } +struct CtSelectInstructions { + unsigned PAndOpc; + unsigned PAndnOpc; + unsigned POrOpc; + unsigned BroadcastOpc; + unsigned IntMoveOpc; + unsigned MoveOpc; + bool Use256; + bool UseBlendInstr; +}; + +static CtSelectInstructions +getCtSelectInstructions(unsigned Opcode, const X86Subtarget &Subtarget) { + CtSelectInstructions Instructions = {}; + + switch (Opcode) { + case X86::CTSELECT_V2F64: + if (Subtarget.hasSSE2()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVAPDrr; + Instructions.UseBlendInstr = true; + } else { + llvm_unreachable("Double precision vectors require SSE2"); + } + break; + case X86::CTSELECT_V4F32: + if (Subtarget.hasSSE41()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVAPSrr; + Instructions.UseBlendInstr = true; + } else if (Subtarget.hasSSE2()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVAPSrr; + } else { + // fallback to SSE1, only support four 32-bit single precision + // floating-point values + Instructions.PAndOpc = X86::ANDPSrr; + Instructions.PAndnOpc = X86::ANDNPSrr; + Instructions.POrOpc = X86::ORPSrr; + Instructions.BroadcastOpc = X86::SHUFPSrri; + Instructions.IntMoveOpc = X86::MOVSS2DIrr; + Instructions.MoveOpc = X86::MOVAPSrr; + } + break; + case X86::CTSELECT_V4I32: + case X86::CTSELECT_V2I64: + case X86::CTSELECT_V8I16: + case X86::CTSELECT_V16I8: + if (Subtarget.hasSSE2()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVDQArr; + } else { + llvm_unreachable("Integer vector operations require SSE2"); + } + break; + case X86::CTSELECT_V8F16: + if (Subtarget.hasSSE2()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = 
X86::MOVDQArr; + } else { + llvm_unreachable("FP16 vector operations require SSE2"); + } + break; + case X86::CTSELECT_V4F32X: + case X86::CTSELECT_V4I32X: + case X86::CTSELECT_V2F64X: + case X86::CTSELECT_V2I64X: + case X86::CTSELECT_V8I16X: + case X86::CTSELECT_V16I8X: + case X86::CTSELECT_V8F16X: + if (Subtarget.hasAVX()) { + Instructions.PAndOpc = X86::VPANDrr; + Instructions.PAndnOpc = X86::VPANDNrr; + Instructions.POrOpc = X86::VPORrr; + Instructions.BroadcastOpc = X86::VPSHUFDri; + Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; + Instructions.MoveOpc = (Opcode == X86::CTSELECT_V4F32X) ? X86::VMOVAPSrr + : (Opcode == X86::CTSELECT_V2F64X) + ? X86::VMOVAPDrr + : X86::VMOVDQArr; + } else { + llvm_unreachable("AVX variants require AVX support"); + } + break; + case X86::CTSELECT_V8F32: + case X86::CTSELECT_V8I32: + if (Subtarget.hasAVX()) { + Instructions.PAndOpc = X86::VPANDYrr; + Instructions.PAndnOpc = X86::VPANDNYrr; + Instructions.POrOpc = X86::VPORYrr; + Instructions.BroadcastOpc = X86::VPERMILPSYri; + Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; + Instructions.MoveOpc = + (Opcode == X86::CTSELECT_V8F32) ? X86::VMOVAPSYrr : X86::VMOVDQAYrr; + Instructions.Use256 = true; + } else { + llvm_unreachable("256-bit vectors require AVX"); + } + break; + case X86::CTSELECT_V4F64: + case X86::CTSELECT_V4I64: + if (Subtarget.hasAVX()) { + Instructions.PAndOpc = X86::VPANDYrr; + Instructions.PAndnOpc = X86::VPANDNYrr; + Instructions.POrOpc = X86::VPORYrr; + Instructions.BroadcastOpc = X86::VPERMILPDYri; + Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; + Instructions.MoveOpc = + (Opcode == X86::CTSELECT_V4F64) ? X86::VMOVAPDYrr : X86::VMOVDQAYrr; + Instructions.Use256 = true; + } else { + llvm_unreachable("256-bit vectors require AVX"); + } + break; + case X86::CTSELECT_V16I16: + case X86::CTSELECT_V32I8: + case X86::CTSELECT_V16F16: + if (Subtarget.hasAVX2()) { + Instructions.PAndOpc = X86::VPANDYrr; + Instructions.PAndnOpc = X86::VPANDNYrr; + Instructions.POrOpc = X86::VPORYrr; + Instructions.BroadcastOpc = X86::VPERMILPSYri; + Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; + Instructions.MoveOpc = X86::VMOVDQAYrr; + Instructions.Use256 = true; + } else if (Subtarget.hasAVX()) { + Instructions.PAndOpc = X86::VPANDYrr; + Instructions.PAndnOpc = X86::VPANDNYrr; + Instructions.POrOpc = X86::VPORYrr; + Instructions.BroadcastOpc = X86::VPERMILPSYri; + Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; + Instructions.MoveOpc = X86::VMOVDQAYrr; + Instructions.Use256 = true; + } else { + llvm_unreachable("256-bit integer vectors require AVX"); + } + break; + default: + llvm_unreachable("Unexpected CTSELECT opcode"); + } + + return Instructions; +} + +bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const { + unsigned Opcode = MI.getOpcode(); + const DebugLoc &DL = MI.getDebugLoc(); + auto Instruction = getCtSelectInstructions(Opcode, Subtarget); + + MachineBasicBlock *MBB = MI.getParent(); + + // Operand layout matches the TableGen definition: + // (outs VR128:$dst, VR128:$tmpx, GR32:$tmpg), + // (ins VR128:$t, VR128:$f, i8imm:$cond) + Register Dst = MI.getOperand(0).getReg(); + Register MaskReg = MI.getOperand(1).getReg(); // vector mask temp + Register TmpGPR = MI.getOperand(2).getReg(); // scalar mask temp (GPR32) + Register FalseVal = MI.getOperand(3).getReg(); // true_value + Register TrueVal = MI.getOperand(4).getReg(); // false_value + X86::CondCode CC = X86::CondCode(MI.getOperand(5).getImm()); // condition + + MachineInstr *FirstInstr = nullptr; + MachineInstr *LastInstr = nullptr; + auto 
recordInstr = [&](MachineInstrBuilder MIB) { + MachineInstr *NewMI = MIB.getInstr(); + LastInstr = NewMI; + if (!FirstInstr) + FirstInstr = NewMI; + }; + + // Create scalar mask in tempGPR and broadcast to vector mask + recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOV32ri), TmpGPR) + .addImm(0) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); + + const TargetRegisterInfo *TRI = &getRegisterInfo(); + auto SubReg = TRI->getSubReg(TmpGPR, X86::sub_8bit); + recordInstr(BuildMI(*MBB, MI, DL, get(X86::SETCCr)) + .addReg(SubReg) + .addImm(CC) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); + + // Zero-extend byte to 32-bit register (movzbl %al, %eax) + recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOVZX32rr8), TmpGPR) + .addReg(SubReg) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); + + if (Instruction.UseBlendInstr && Subtarget.hasSSE41()) { + // Shift left 31 bits to convert 1 -> 0x80000000, 0 -> 0x00000000 (shll $31, + // %eax) + recordInstr(BuildMI(*MBB, MI, DL, get(X86::SHL32ri), TmpGPR) + .addReg(TmpGPR) + .addImm(31)); + } else { + // Negate to convert 1 -> 0xFFFFFFFF, 0 -> 0x00000000 (negl %eax) + recordInstr(BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR) + .addReg(TmpGPR)); + } + + // Broadcast to TmpX (vector mask) + recordInstr(BuildMI(*MBB, MI, DL, get(X86::PXORrr), MaskReg) + .addReg(MaskReg) + .addReg(MaskReg) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); + + // Move scalar mask to vector register + recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.IntMoveOpc), MaskReg) + .addReg(TmpGPR) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); + + if (Instruction.Use256) { + // Broadcast to 256-bit vector register + recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg) + .addReg(MaskReg) + .addImm(0) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); + } else { + if (Subtarget.hasSSE2() || Subtarget.hasAVX()) { + recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg) + .addReg(MaskReg) + .addImm(0x00) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); + } else { + recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg) + .addReg(MaskReg) + .addReg(MaskReg) + .addImm(0x00) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); + } + } + + if (Instruction.UseBlendInstr && Subtarget.hasSSE41()) { + // Use dedicated blend instructions for SSE4.1+ + unsigned BlendOpc; + switch (Opcode) { + case X86::CTSELECT_V4F32: + BlendOpc = X86::BLENDVPSrr0; + break; + case X86::CTSELECT_V2F64: + BlendOpc = X86::BLENDVPDrr0; + break; + default: + // alias for pblendvb that takes xmm0 as implicit mask register + BlendOpc = X86::PBLENDVBrr0; + break; + } + + // Check if XMM0 is used as one of source registers, if yes then save it + // in Dst register and update FalseVal and TrueVal to Dst register + bool DidSaveXMM0 = false; + Register SavedXMM0 = X86::XMM0; + if (FalseVal == X86::XMM0 || TrueVal == X86::XMM0) { + Register SrcXMM0 = (FalseVal == X86::XMM0) ? 
FalseVal : TrueVal;
+
+      // If XMM0 is one of the source registers, it will not match the Dst
+      // register, so we need to move it into the Dst register first
+      recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+                      .addReg(SrcXMM0)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+      // update FalseVal and TrueVal to Dst register
+      if (FalseVal == X86::XMM0)
+        FalseVal = Dst;
+      if (TrueVal == X86::XMM0)
+        TrueVal = Dst;
+
+      // update SavedXMM0 to Dst register
+      SavedXMM0 = Dst;
+
+      // set DidSaveXMM0 to true to indicate that we saved XMM0 into Dst
+      // register
+      DidSaveXMM0 = true;
+    } else if (MaskReg != X86::XMM0 && Dst != X86::XMM0) {
+
+      // If XMM0 is not allocated to any of the registers, we still need to
+      // save and restore it after using it as the mask register
+      recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+                      .addReg(X86::XMM0)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+      SavedXMM0 = Dst;
+      DidSaveXMM0 = true;
+    }
+
+    if (MaskReg != X86::XMM0) {
+      // BLENDV uses XMM0 as implicit mask register
+      // https://www.felixcloutier.com/x86/pblendvb
+      recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), X86::XMM0)
+                      .addReg(MaskReg)
+                      .setMIFlag(MachineInstr::MIFlag::NoMerge));
+
+      // move FalseVal to mask (use MaskReg as the dst of the blend)
+      recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), MaskReg)
+                      .addReg(FalseVal)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+      // MaskReg := blend(MaskReg /*false*/, TrueVal /*true*/)  ; mask in
+      // xmm0
+      recordInstr(BuildMI(*MBB, MI, DL, get(BlendOpc), MaskReg)
+                      .addReg(MaskReg)
+                      .addReg(TrueVal)
+                      .addReg(X86::XMM0)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+      // restore XMM0 from SavedXMM0 if we saved it into Dst
+      if (DidSaveXMM0) {
+        recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), X86::XMM0)
+                        .addReg(SavedXMM0)
+                        .setMIFlags(MachineInstr::MIFlag::NoMerge));
+      }
+      // dst = result (now in MaskReg)
+      recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+                      .addReg(MaskReg)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+    } else {
+      // move FalseVal to Dst register since MaskReg is XMM0 and Dst is not
+      recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+                      .addReg(FalseVal)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+      // Dst := blend(Dst /*false*/, TrueVal /*true*/)  ; mask in
+      // xmm0
+      recordInstr(BuildMI(*MBB, MI, DL, get(BlendOpc), Dst)
+                      .addReg(Dst)
+                      .addReg(TrueVal)
+                      .addReg(X86::XMM0)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+    }
+  } else {
+
+    // dst = mask
+    recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+                    .addReg(MaskReg)
+                    .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+    // mask &= true_val
+    recordInstr(BuildMI(*MBB, MI, DL, get(X86::PANDrr), MaskReg)
+                    .addReg(MaskReg)
+                    .addReg(TrueVal)
+                    .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+    // dst = ~mask & false_val
+    recordInstr(BuildMI(*MBB, MI, DL, get(X86::PANDNrr), Dst)
+                    .addReg(Dst)
+                    .addReg(FalseVal)
+                    .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+    // dst |= mask; (mask & t) | (~mask & f)
+    recordInstr(BuildMI(*MBB, MI, DL, get(X86::PORrr), Dst)
+                    .addReg(Dst)
+                    .addReg(MaskReg)
+                    .setMIFlags(MachineInstr::MIFlag::NoMerge));
+  }
+
+  assert(FirstInstr && LastInstr && "Expected at least one expanded instruction");
+  auto BundleEnd = LastInstr->getIterator();
+  finalizeBundle(*MBB, FirstInstr->getIterator(), std::next(BundleEnd));
+
+  MI.eraseFromParent();
+
+  return true;
+}
+
+bool X86InstrInfo::expandCtSelectWithCMOV(MachineInstr &MI) const {
+  MachineBasicBlock *MBB =
MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // CTSELECT pseudo has: (outs dst), (ins true_val, false_val, cond)
+  MachineOperand &OperandRes = MI.getOperand(0);  // destination register
+  MachineOperand &OperandTrue = MI.getOperand(1); // true value
+  MachineOperand &OperandCond = MI.getOperand(3); // condition code
+
+  assert(OperandTrue.isReg() && OperandRes.isReg() && OperandCond.isImm() &&
+         "Invalid operand types");
+  assert(OperandTrue.getReg() == OperandRes.getReg() &&
+         "Result register must be tied to the true-value register");
+
+  assert(Subtarget.hasCMOV() && "target does not support CMOV instructions");
+
+  unsigned Opcode = 0;
+
+  switch (MI.getOpcode()) {
+  case X86::CTSELECT16rr:
+    Opcode = X86::CMOV16rr;
+    break;
+  case X86::CTSELECT32rr:
+    Opcode = X86::CMOV32rr;
+    break;
+  case X86::CTSELECT64rr:
+    Opcode = X86::CMOV64rr;
+    break;
+  case X86::CTSELECT16rm:
+    Opcode = X86::CMOV16rm;
+    break;
+  case X86::CTSELECT32rm:
+    Opcode = X86::CMOV32rm;
+    break;
+  case X86::CTSELECT64rm:
+    Opcode = X86::CMOV64rm;
+    break;
+  default:
+    llvm_unreachable("Invalid CTSELECT opcode");
+  }
+
+  // Build the CMOV instruction by copying all operands of the pseudo
+  // (dst, true, false, cond) onto the corresponding CMOVcc instruction.
+  MachineInstrBuilder CmovBuilder = BuildMI(*MBB, MI, DL, get(Opcode));
+  for (unsigned i = 0u; i < MI.getNumOperands(); ++i) {
+    CmovBuilder.add(MI.getOperand(i));
+  }
+
+  // Remove the original CTSELECT instruction
+  MI.eraseFromParent();
+  return true;
+}
+
+/// Expand i386-specific CTSELECT pseudo instructions (post-RA, constant-time)
+/// These internal pseudos receive a pre-materialized condition byte from the
+/// custom inserter, avoiding EFLAGS corruption issues during i64 type legalization.
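+///
+/// For illustration, the GR32 expansion emits roughly the following bundle
+/// (register names below are placeholders; the actual temporaries are chosen
+/// by the register allocator):
+///   movb   %cond, %tmpb     ; copy the pre-materialized condition byte
+///   movzbl %tmpb, %tmpm     ; zero-extend to the selection width (0 or 1)
+///   negl   %tmpm            ; 1 -> 0xFFFFFFFF, 0 -> 0x00000000
+///   movl   %src1, %dst      ; dst = true value
+///   andl   %tmpm, %dst      ; dst &= mask
+///   notl   %tmpm            ; invert the mask
+///   andl   %src2, %tmpm     ; ~mask &= false value
+///   orl    %tmpm, %dst      ; dst = (src1 & mask) | (src2 & ~mask)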
+bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const { + MachineBasicBlock *MBB = MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + // CTSELECT_I386_INT_GRxxrr has operands: (outs dst, tmp_byte, tmp_mask), + // (ins src1, src2, cond_byte) + // Note: cond_byte is pre-materialized by custom inserter, not EFLAGS-dependent + Register DstReg = MI.getOperand(0).getReg(); + Register TmpByteReg = MI.getOperand(1).getReg(); + Register TmpMaskReg = MI.getOperand(2).getReg(); + Register Src1Reg = MI.getOperand(3).getReg(); + Register Src2Reg = MI.getOperand(4).getReg(); + Register CondByteReg = MI.getOperand(5).getReg(); // Pre-materialized condition byte + + // Determine instruction opcodes based on register width + unsigned MovZXOp, NegOp, MovOp, AndOp, NotOp, OrOp; + if (MI.getOpcode() == X86::CTSELECT_I386_INT_GR8rr) { + MovZXOp = 0; // No zero-extend needed for GR8 + NegOp = X86::NEG8r; + MovOp = X86::MOV8rr; + AndOp = X86::AND8rr; + NotOp = X86::NOT8r; + OrOp = X86::OR8rr; + } else if (MI.getOpcode() == X86::CTSELECT_I386_INT_GR16rr) { + MovZXOp = X86::MOVZX16rr8; + NegOp = X86::NEG16r; + MovOp = X86::MOV16rr; + AndOp = X86::AND16rr; + NotOp = X86::NOT16r; + OrOp = X86::OR16rr; + } else { // X86::CTSELECT_I386_INT_GR32rr + MovZXOp = X86::MOVZX32rr8; + NegOp = X86::NEG32r; + MovOp = X86::MOV32rr; + AndOp = X86::AND32rr; + NotOp = X86::NOT32r; + OrOp = X86::OR32rr; + } + + // 7-instruction constant-time selection bundle (no SETCC inside): + // result = (true_val & mask) | (false_val & ~mask) + // The condition byte is already materialized, avoiding EFLAGS dependency + + // Step 1: Copy pre-materialized condition byte to TmpByteReg + // This allows the bundle to work with allocated temporaries + auto I1 = BuildMI(*MBB, MI, DL, get(X86::MOV8rr), TmpByteReg) + .addReg(CondByteReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + auto BundleStart = I1->getIterator(); + + // Step 2: Zero-extend condition byte to register width (0 or 1) + if (MI.getOpcode() != X86::CTSELECT_I386_INT_GR8rr) { + BuildMI(*MBB, MI, DL, get(MovZXOp), TmpMaskReg) + .addReg(TmpByteReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + } + + // Step 3: Convert condition to bitmask (NEG: 1 -> 0xFFFF..., 0 -> 0x0000...) + Register MaskReg = (MI.getOpcode() == X86::CTSELECT_I386_INT_GR8rr) ? 
TmpByteReg : TmpMaskReg; + BuildMI(*MBB, MI, DL, get(NegOp), MaskReg) + .addReg(MaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Step 4,5: Apply mask to true value - copy src1 to dest, then AND with mask + BuildMI(*MBB, MI, DL, get(MovOp), DstReg) + .addReg(Src1Reg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + BuildMI(*MBB, MI, DL, get(AndOp), DstReg) + .addReg(DstReg) + .addReg(MaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Step 6: Create inverted mask inline (~mask) + BuildMI(*MBB, MI, DL, get(NotOp), MaskReg) + .addReg(MaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Step 7: Apply inverted mask to false value - reuse mask register directly + BuildMI(*MBB, MI, DL, get(AndOp), MaskReg) + .addReg(MaskReg) + .addReg(Src2Reg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Step 8: Final result: (src1 & mask) | (src2 & ~mask) + auto LI = BuildMI(*MBB, MI, DL, get(OrOp), DstReg) + .addReg(DstReg) + .addReg(MaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Bundle all generated instructions for atomic execution before removing MI + auto BundleEnd = std::next(LI->getIterator()); + if (BundleStart != BundleEnd) { + // Only bundle if we have multiple instructions + finalizeBundle(*MBB, BundleStart, BundleEnd); + } + + // TODO: Optimization opportunity - The register allocator may choose callee-saved + // registers (e.g., %ebx, %esi) for TmpByteReg/TmpMaskReg, causing unnecessary + // save/restore overhead. Consider constraining these to caller-saved register + // classes (e.g., GR8_AL, GR32_CallSaved) in the TableGen definitions to improve + // constant-time performance by eliminating prologue/epilogue instructions. + + // Remove the original pseudo instruction + MI.eraseFromParent(); + return true; +} + static bool isFrameLoadOpcode(int Opcode, TypeSize &MemBytes) { switch (Opcode) { default: @@ -6402,6 +6952,43 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::ADD64ri32_DB: MIB->setDesc(get(X86::OR64ri32)); break; + + case X86::CTSELECT64rr: + case X86::CTSELECT32rr: + case X86::CTSELECT16rr: + case X86::CTSELECT64rm: + case X86::CTSELECT32rm: + case X86::CTSELECT16rm: + // These CTSELECT pseudos are only selected when CMOV is available + // Pattern matching ensures we use CTSELECT_I386 when CMOV is not available + return expandCtSelectWithCMOV(MI); + + // non-cmov CTSELECT expansion (post-RA, constant-time) + // These are the internal pseudos with pre-materialized condition byte + case X86::CTSELECT_I386_INT_GR8rr: + case X86::CTSELECT_I386_INT_GR16rr: + case X86::CTSELECT_I386_INT_GR32rr: + return expandCtSelectIntWithoutCMOV(MI); + + case X86::CTSELECT_V2F64: + case X86::CTSELECT_V4F32: + case X86::CTSELECT_V2I64: + case X86::CTSELECT_V4I32: + case X86::CTSELECT_V8I16: + case X86::CTSELECT_V16I8: + case X86::CTSELECT_V2F64X: + case X86::CTSELECT_V4F32X: + case X86::CTSELECT_V2I64X: + case X86::CTSELECT_V4I32X: + case X86::CTSELECT_V8I16X: + case X86::CTSELECT_V16I8X: + case X86::CTSELECT_V4I64: + case X86::CTSELECT_V8I32: + case X86::CTSELECT_V16I16: + case X86::CTSELECT_V32I8: + case X86::CTSELECT_V4F64: + case X86::CTSELECT_V8F32: + return expandCtSelectVector(MI); } return false; } @@ -10800,27 +11387,39 @@ void X86InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB, if (!ST.hasSSE1()) return; - BuildMI(MBB, Iter, DL, get(X86::V_SET0), Reg); + // PXOR is safe to use because it doesn't affect flags. 
+ BuildMI(MBB, Iter, DL, get(X86::PXORrr), Reg) + .addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); } else if (X86::VR256RegClass.contains(Reg)) { // YMM# if (!ST.hasAVX()) return; - BuildMI(MBB, Iter, DL, get(X86::AVX_SET0), Reg); + // VPXOR is safe to use because it doesn't affect flags. + BuildMI(MBB, Iter, DL, get(X86::VPXORrr), Reg) + .addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); } else if (X86::VR512RegClass.contains(Reg)) { // ZMM# if (!ST.hasAVX512()) return; - BuildMI(MBB, Iter, DL, get(X86::AVX512_512_SET0), Reg); + // VPXORY is safe to use because it doesn't affect flags. + BuildMI(MBB, Iter, DL, get(X86::VPXORYrr), Reg) + .addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); } else if (X86::VK1RegClass.contains(Reg) || X86::VK2RegClass.contains(Reg) || X86::VK4RegClass.contains(Reg) || X86::VK8RegClass.contains(Reg) || X86::VK16RegClass.contains(Reg)) { if (!ST.hasVLX()) return; - unsigned Op = ST.hasBWI() ? X86::KSET0Q : X86::KSET0W; - BuildMI(MBB, Iter, DL, get(Op), Reg); + // KXOR is safe to use because it doesn't affect flags. + unsigned Op = ST.hasBWI() ? X86::KXORQkk : X86::KXORWkk; + BuildMI(MBB, Iter, DL, get(Op), Reg) + .addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); } } diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 5f75559bd9598..ebd7e070d5fe8 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -724,6 +724,12 @@ class X86InstrInfo final : public X86GenInstrInfo { bool isFrameOperand(const MachineInstr &MI, unsigned int Op, int &FrameIndex) const; + /// Expand the CTSELECT pseudo-instructions. + bool expandCtSelectWithCMOV(MachineInstr &MI) const; + bool expandCtSelectIntWithoutCMOV(MachineInstr &MI) const; + + bool expandCtSelectVector(MachineInstr &MI) const; + /// Returns true iff the routine could find two commutable operands in the /// given machine instruction with 3 vector inputs. /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td index 98104a6fad1a9..6b585a5b0b436 100644 --- a/llvm/lib/Target/X86/X86InstrPredicates.td +++ b/llvm/lib/Target/X86/X86InstrPredicates.td @@ -49,6 +49,11 @@ def HasZU : Predicate<"Subtarget->hasZU()">; def HasCF : Predicate<"Subtarget->hasCF()">; def HasCMOV : Predicate<"Subtarget->canUseCMOV()">; def NoCMOV : Predicate<"!Subtarget->canUseCMOV()">; +// Predicates for native CMOV instruction (checks hasCMOV(), not canUseCMOV()) +// HasCMOV may be true even without native CMOV (e.g., via SSE emulation) +// Use HasNativeCMOV/NoNativeCMOV for constant-time code that requires actual CMOV +def HasNativeCMOV : Predicate<"Subtarget->hasCMOV()">; +def NoNativeCMOV : Predicate<"!Subtarget->hasCMOV()">; def HasNOPL : Predicate<"Subtarget->hasNOPL()">; def HasMMX : Predicate<"Subtarget->hasMMX()">; def HasSSE1 : Predicate<"Subtarget->hasSSE1()">; diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 9a76abcd351bf..66c9d75053640 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -617,10 +617,11 @@ void X86PassConfig::addPreEmitPass2() { // ObjC runtime functions present in the module. 
const Function &F = MF.getFunction(); const Module *M = F.getParent(); - return M->getModuleFlag("kcfi") || + return M->getModuleFlag("kcfi") || F.hasFnAttribute("ct-select") || (TT.isOSDarwin() && (M->getFunction("objc_retainAutoreleasedReturnValue") || - M->getFunction("objc_unsafeClaimAutoreleasedReturnValue"))); + M->getFunction("objc_unsafeClaimAutoreleasedReturnValue"))) || + F.hasFnAttribute("ct-select"); })); // Analyzes and emits pseudos to support Win x64 Unwind V2. This pass must run diff --git a/llvm/test/CodeGen/X86/ctselect-edge-cases.ll b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll new file mode 100644 index 0000000000000..0797265972a1f --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll @@ -0,0 +1,409 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X32 + +; Test ct.select edge cases and corner cases + +; Test with very large integers +define i128 @test_ctselect_i128(i1 %cond, i128 %a, i128 %b) { +; X64-LABEL: test_ctselect_i128: +; X64: # %bb.0: +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rsi, %rax +; X64-NEXT: cmovneq %rdx, %r8 +; X64-NEXT: movq %r8, %rdx +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_i128: +; X32: # %bb.0: +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: pushl %eax +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %esi +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edi +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edx +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl %ecx, 12(%eax) +; X32-NEXT: movl %edx, 8(%eax) +; X32-NEXT: movl %edi, 4(%eax) +; X32-NEXT: movl %esi, (%eax) +; X32-NEXT: addl $4, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl $4 + %result = call i128 @llvm.ct.select.i128(i1 %cond, i128 %a, i128 %b) + ret i128 %result +} + +; Test with small integer types +define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) { +; X64-LABEL: test_ctselect_i1: +; X64: # %bb.0: +; X64-NEXT: movl %edx, %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %esi, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_i1: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: # kill: def $al killed $al killed $eax +; X32-NEXT: retl + %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b) + ret i1 %result +} + +; Test with extremal values +define i32 @test_ctselect_extremal_values(i1 %cond) { +; X64-LABEL: test_ctselect_extremal_values: +; X64: # %bb.0: +; X64-NEXT: testb $1, %dil +; X64-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF +; X64-NEXT: movl $-2147483648, %eax # imm = 0x80000000 +; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_extremal_values: +; X32: # %bb.0: 
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF +; X32-NEXT: movl $-2147483648, %eax # imm = 0x80000000 +; X32-NEXT: cmovnel %ecx, %eax +; X32-NEXT: retl + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648) + ret i32 %result +} + +; Test with floating point special values +define float @test_ctselect_f32_special_values(i1 %cond) { +; X64-LABEL: test_ctselect_f32_special_values: +; X64: # %bb.0: +; X64-NEXT: testb $1, %dil +; X64-NEXT: movl $2143289344, %eax # imm = 0x7FC00000 +; X64-NEXT: movl $2139095040, %ecx # imm = 0x7F800000 +; X64-NEXT: cmovnel %eax, %ecx +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_f32_special_values: +; X32: # %bb.0: +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: pushl %eax +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: sete %al +; X32-NEXT: movl {{\.?LCPI[0-9]+_[0-9]+}}, %ecx +; X32-NEXT: movl {{\.?LCPI[0-9]+_[0-9]+}}, %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, (%esp) +; X32-NEXT: flds (%esp) +; X32-NEXT: addl $4, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl + %result = call float @llvm.ct.select.f32(i1 %cond, float 0x7FF8000000000000, float 0x7FF0000000000000) + ret float %result +} + +define double @test_ctselect_f64_special_values(i1 %cond) { +; X64-LABEL: test_ctselect_f64_special_values: +; X64: # %bb.0: +; X64-NEXT: testb $1, %dil +; X64-NEXT: movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000 +; X64-NEXT: movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000 +; X64-NEXT: cmovneq %rax, %rcx +; X64-NEXT: movq %rcx, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_f64_special_values: +; X32: # %bb.0: +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: subl $24, %esp +; X32-NEXT: .cfi_def_cfa_offset 36 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; X32-NEXT: sete %al +; X32-NEXT: fxch %st(1) +; X32-NEXT: fstpl {{[0-9]+}}(%esp) +; X32-NEXT: fstpl (%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl (%esp), %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-NEXT: fldl {{[0-9]+}}(%esp) +; X32-NEXT: addl $24, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: 
popl %edi +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl + %result = call double @llvm.ct.select.f64(i1 %cond, double 0x7FF8000000000000, double 0x7FF0000000000000) + ret double %result +} + +; Test with null pointers +define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { +; X64-LABEL: test_ctselect_null_ptr: +; X64: # %bb.0: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rsi, %rax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_null_ptr: +; X32: # %bb.0: +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null) + ret ptr %result +} + +; Test with function pointers +define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) { +; X64-LABEL: test_ctselect_function_ptr: +; X64: # %bb.0: +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rsi, %rax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_function_ptr: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2) + ret ptr %result +} + +; Test with volatile loads +define i32 @test_ctselect_volatile_load(i1 %cond, ptr %p1, ptr %p2) { +; X64-LABEL: test_ctselect_volatile_load: +; X64: # %bb.0: +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: movl (%rdx), %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_volatile_load: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl (%ecx), %ecx +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel %ecx, %eax +; X32-NEXT: retl + %a = load volatile i32, ptr %p1 + %b = load volatile i32, ptr %p2 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test with atomic loads +define i32 @test_ctselect_atomic_load(i1 %cond, ptr %p1, ptr %p2) { +; X64-LABEL: test_ctselect_atomic_load: +; X64: # %bb.0: +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: movl (%rdx), %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_atomic_load: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl (%ecx), %ecx +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel %ecx, %eax +; X32-NEXT: retl + %a = load atomic i32, ptr %p1 acquire, align 4 + %b = load atomic i32, ptr %p2 acquire, align 4 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test with condition from icmp on pointers +define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) { +; X64-LABEL: test_ctselect_ptr_cmp: +; X64: # %bb.0: +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: sete %cl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovneq %rdx, %rax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_ptr_cmp: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: sete %cl +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl + %cmp = icmp eq ptr %p1, %p2 + %result = call ptr @llvm.ct.select.p0(i1 %cmp, ptr %a, ptr %b) + ret 
ptr %result +} + +; Test with struct pointer types (struct types themselves may not be directly supported) +%struct.pair = type { i32, i32 } + +define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { +; X64-LABEL: test_ctselect_struct_ptr: +; X64: # %bb.0: +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rsi, %rax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_struct_ptr: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) + ret ptr %result +} + +; Test with deeply nested conditions (stress test for instruction selection) +define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +; X64-LABEL: test_ctselect_deeply_nested: +; X64: # %bb.0: +; X64-NEXT: movl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; X64-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %r8d, %r9d +; X64-NEXT: testb $1, %sil +; X64-NEXT: cmovnel %r9d, %r11d +; X64-NEXT: testb $1, %dl +; X64-NEXT: cmovnel %r11d, %r10d +; X64-NEXT: testb $1, %cl +; X64-NEXT: cmovnel %r10d, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_deeply_nested: +; X32: # %bb.0: +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_offset %esi, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %esi +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel %esi, %edx +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel %edx, %ecx +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel %ecx, %eax +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + %sel4 = call i32 @llvm.ct.select.i32(i1 %c4, i32 %sel3, i32 %e) + ret i32 %sel4 +} + +; Test with misaligned loads +define i32 @test_ctselect_misaligned_load(i1 %cond, ptr %p1, ptr %p2) { +; X64-LABEL: test_ctselect_misaligned_load: +; X64: # %bb.0: +; X64-NEXT: movl (%rdx), %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel (%rsi), %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_misaligned_load: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel (%ecx), %eax +; X32-NEXT: retl + %a = load i32, ptr %p1, align 1 + %b = load i32, ptr %p2, align 1 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Declare the intrinsics +declare i1 @llvm.ct.select.i1(i1, i1, i1) +declare i128 @llvm.ct.select.i128(i1, i128, i128) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare float @llvm.ct.select.f32(i1, float, float) +declare double @llvm.ct.select.f64(i1, double, double) +declare ptr @llvm.ct.select.p0(i1, ptr, ptr) diff --git a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll new file mode 100644 index 0000000000000..ea943307c644f --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll @@ -0,0 +1,722 @@ +; NOTE: 
Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=I386-NOCMOV +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=I386-CMOV +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix=I386-NOCMOV + +; Comprehensive CTSELECT tests for i386 targets with floating-point types +; - Without CMOV: constant-time implementation using FP->int conversion + existing post-RA CTSELECT +; - With CMOV: CMOV-based implementation +; - Verifies security properties: no conditional branches, constant execution time +; Strategy: FP values stored to memory, converted to integers, CTSELECT on integers, converted back to FP + +; Test basic f32 functionality +define float @test_ctselect_f32_basic(i1 %cond, float %a, float %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_basic: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: pushl %eax +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $4, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_basic: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: pushl %eax +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: flds (%esp) +; I386-CMOV-NEXT: addl $4, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test f32 with different condition codes +define float @test_ctselect_f32_eq(float %x, float %y, float %a, float %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_eq: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: pushl %eax +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fucompp +; I386-NOCMOV-NEXT: fnstsw %ax +; I386-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax +; I386-NOCMOV-NEXT: sahf +; I386-NOCMOV-NEXT: setnp %al +; I386-NOCMOV-NEXT: sete %cl +; I386-NOCMOV-NEXT: testb %al, %cl +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi 
+; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $4, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_eq: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: pushl %eax +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fucompi %st(1), %st +; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: setnp %al +; I386-CMOV-NEXT: sete %cl +; I386-CMOV-NEXT: testb %al, %cl +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: flds (%esp) +; I386-CMOV-NEXT: addl $4, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %cmp = fcmp oeq float %x, %y + %result = call float @llvm.ct.select.f32(i1 %cmp, float %a, float %b) + ret float %result +} + +; Test basic f64 functionality +define double @test_ctselect_f64_basic(i1 %cond, double %a, double %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f64_basic: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $8, %esp +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fldl (%esp) +; I386-NOCMOV-NEXT: addl $8, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f64_basic: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: subl $8, %esp +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl 
%ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fldl (%esp) +; I386-CMOV-NEXT: addl $8, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + +; Test basic x86_fp80 functionality +define x86_fp80 @test_ctselect_f80_basic(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f80_basic: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $12, %esp +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fldt (%esp) +; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f80_basic: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: subl $12, %esp +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; 
I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fldt (%esp) +; I386-CMOV-NEXT: addl $12, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b) + ret x86_fp80 %result +} + +; Test f32 with complex conditions +define float @test_ctselect_f32_gt(float %x, float %y, float %a, float %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_gt: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: pushl %eax +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fucompp +; I386-NOCMOV-NEXT: fnstsw %ax +; I386-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax +; I386-NOCMOV-NEXT: sahf +; I386-NOCMOV-NEXT: seta %al +; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $4, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_gt: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: pushl %eax +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fucompi %st(1), %st +; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: seta %al +; I386-CMOV-NEXT: testb %al, %al +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: flds (%esp) +; I386-CMOV-NEXT: addl $4, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %cmp = fcmp ogt float %x, %y + %result = call float @llvm.ct.select.f32(i1 %cmp, float %a, float %b) + ret float %result +} + +; Test constant-time properties: verify no branches in generated code +define float @test_ctselect_f32_no_branches(i1 %cond, float %a, float %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_no_branches: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: pushl %eax +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, 
(%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $4, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_no_branches: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: pushl %eax +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: flds (%esp) +; I386-CMOV-NEXT: addl $4, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test that BUNDLE directives are present for constant-time guarantees +define float @test_ctselect_f32_bundled(i1 %cond, float %a, float %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_bundled: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: pushl %eax +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $4, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_bundled: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: pushl %eax +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: flds (%esp) +; I386-CMOV-NEXT: addl $4, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test edge case: NaN handling +define float @test_ctselect_f32_nan(i1 %cond) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_nan: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $12, %esp +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; I386-NOCMOV-NEXT: fldz +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: fxch %st(1) +; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstps (%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl (%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; 
I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_nan: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: subl $12, %esp +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; I386-CMOV-NEXT: fldz +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: fxch %st(1) +; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fstps (%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl (%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: addl $12, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %nan = bitcast i32 2139095040 to float ; 0x7F800000 = +inf + %zero = bitcast i32 0 to float + %result = call float @llvm.ct.select.f32(i1 %cond, float %nan, float %zero) + ret float %result +} + +; Test memory alignment for f80 +define x86_fp80 @test_ctselect_f80_alignment(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f80_alignment: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $12, %esp +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fldt (%esp) +; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f80_alignment: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; 
I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: subl $12, %esp +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fldt (%esp) +; I386-CMOV-NEXT: addl $12, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b) + ret x86_fp80 %result +} + +; Stress test: multiple CTSELECT operations +define float @test_ctselect_f32_multiple(i1 %cond1, i1 %cond2, float %a, float %b, float %c, float %d) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_multiple: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $8, %esp +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $8, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_multiple: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: subl $8, %esp +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl 
%edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: flds (%esp) +; I386-CMOV-NEXT: addl $8, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %sel1 = call float @llvm.ct.select.f32(i1 %cond1, float %a, float %b) + %sel2 = call float @llvm.ct.select.f32(i1 %cond2, float %sel1, float %c) + ret float %sel2 +} + +; Declare intrinsics +declare float @llvm.ct.select.f32(i1, float, float) +declare double @llvm.ct.select.f64(i1, double, double) +declare x86_fp80 @llvm.ct.select.f80(i1, x86_fp80, x86_fp80) diff --git a/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll b/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll new file mode 100644 index 0000000000000..bc7980c357e0e --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll @@ -0,0 +1,428 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-cmov,+mmx < %s | FileCheck %s --check-prefix=I386-NOCMOV +; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+cmov,+mmx < %s | FileCheck %s --check-prefix=I386-CMOV +; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-cmov,+mmx -verify-machineinstrs < %s | FileCheck %s --check-prefix=I386-NOCMOV + +; Test constant-time selection with MMX intrinsics to exercise VR64 CTSELECT +; These tests use MMX intrinsics to create <1 x i64> values that get allocated to VR64 registers + +; Test MMX ct.select using paddd intrinsic to force VR64 allocation +define <1 x i64> @test_mmx_ctselect_with_paddd(i32 %cond, i64 %a, i64 %b) { +; I386-NOCMOV-LABEL: test_mmx_ctselect_with_paddd: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: subl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40 +; I386-NOCMOV-NEXT: .cfi_offset %esi, -20 +; I386-NOCMOV-NEXT: .cfi_offset %edi, -16 +; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12 +; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8 +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %bl +; I386-NOCMOV-NEXT: testb %bl, %bl +; I386-NOCMOV-NEXT: sete %bh +; I386-NOCMOV-NEXT: movb %bh, %al +; I386-NOCMOV-NEXT: movzbl %al, %ebp +; I386-NOCMOV-NEXT: negl %ebp +; I386-NOCMOV-NEXT: movl %esi, %edi +; I386-NOCMOV-NEXT: andl %ebp, %edi +; I386-NOCMOV-NEXT: notl %ebp +; I386-NOCMOV-NEXT: andl %ecx, %ebp +; I386-NOCMOV-NEXT: orl %ebp, %edi +; I386-NOCMOV-NEXT: testb %bl, %bl +; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi +; I386-NOCMOV-NEXT: movb 
%al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %ecx +; I386-NOCMOV-NEXT: andl %esi, %ecx +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: orl %esi, %ecx +; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-NOCMOV-NEXT: paddd %mm0, %mm0 +; I386-NOCMOV-NEXT: movq %mm0, (%esp) +; I386-NOCMOV-NEXT: movl (%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: addl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: popl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_mmx_ctselect_with_paddd: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: subl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 24 +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %dl +; I386-CMOV-NEXT: testb %dl, %dl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-CMOV-NEXT: paddd %mm0, %mm0 +; I386-CMOV-NEXT: movq %mm0, (%esp) +; I386-CMOV-NEXT: movl (%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: addl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-CMOV-NEXT: retl + %mmx_a = bitcast i64 %a to <1 x i64> + %mmx_b = bitcast i64 %b to <1 x i64> + %cmp = icmp ne i32 %cond, 0 + %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b) + %result = call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %sel, <1 x i64> %sel) + ret <1 x i64> %result +} + +; Test MMX ct.select using psllw intrinsic +define <1 x i64> @test_mmx_ctselect_with_psllw(i32 %cond, i64 %a, i64 %b) { +; I386-NOCMOV-LABEL: test_mmx_ctselect_with_psllw: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: subl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40 +; I386-NOCMOV-NEXT: .cfi_offset %esi, -20 +; I386-NOCMOV-NEXT: .cfi_offset %edi, -16 +; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12 +; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8 +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %bl +; I386-NOCMOV-NEXT: testb %bl, %bl +; I386-NOCMOV-NEXT: sete %bh +; I386-NOCMOV-NEXT: movb %bh, %al +; I386-NOCMOV-NEXT: movzbl %al, %ebp +; I386-NOCMOV-NEXT: negl %ebp +; I386-NOCMOV-NEXT: movl %esi, %edi +; I386-NOCMOV-NEXT: andl %ebp, %edi +; I386-NOCMOV-NEXT: notl %ebp +; I386-NOCMOV-NEXT: andl %ecx, %ebp +; I386-NOCMOV-NEXT: orl %ebp, %edi +; I386-NOCMOV-NEXT: testb %bl, %bl +; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl 
{{[0-9]+}}(%esp), %edi +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %ecx +; I386-NOCMOV-NEXT: andl %esi, %ecx +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: orl %esi, %ecx +; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-NOCMOV-NEXT: psllw %mm0, %mm0 +; I386-NOCMOV-NEXT: movq %mm0, (%esp) +; I386-NOCMOV-NEXT: movl (%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: addl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: popl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_mmx_ctselect_with_psllw: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: subl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 24 +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %dl +; I386-CMOV-NEXT: testb %dl, %dl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-CMOV-NEXT: psllw %mm0, %mm0 +; I386-CMOV-NEXT: movq %mm0, (%esp) +; I386-CMOV-NEXT: movl (%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: addl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-CMOV-NEXT: retl + %mmx_a = bitcast i64 %a to <1 x i64> + %mmx_b = bitcast i64 %b to <1 x i64> + %cmp = icmp ne i32 %cond, 0 + %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b) + %result = call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %sel, <1 x i64> %sel) + ret <1 x i64> %result +} + +; Test nested MMX ct.selects with pand intrinsic +define <1 x i64> @test_mmx_nested_ctselect_with_pand(i32 %cond1, i32 %cond2, i64 %a, i64 %b, i64 %c) { +; I386-NOCMOV-LABEL: test_mmx_nested_ctselect_with_pand: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: subl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40 +; I386-NOCMOV-NEXT: .cfi_offset %esi, -20 +; I386-NOCMOV-NEXT: .cfi_offset %edi, -16 +; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12 +; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8 +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %bl +; I386-NOCMOV-NEXT: testb %bl, %bl +; I386-NOCMOV-NEXT: sete %bh +; I386-NOCMOV-NEXT: movb %bh, %cl +; I386-NOCMOV-NEXT: movzbl %cl, %ebp +; I386-NOCMOV-NEXT: negl %ebp +; I386-NOCMOV-NEXT: movl %edx, %edi +; I386-NOCMOV-NEXT: andl %ebp, %edi +; I386-NOCMOV-NEXT: notl %ebp +; I386-NOCMOV-NEXT: andl %eax, %ebp +; I386-NOCMOV-NEXT: orl %ebp, %edi +; I386-NOCMOV-NEXT: testb %bl, %bl +; I386-NOCMOV-NEXT: 
movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: sete %dl +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movb %dl, %dh +; I386-NOCMOV-NEXT: movzbl %dh, %ebp +; I386-NOCMOV-NEXT: negl %ebp +; I386-NOCMOV-NEXT: movl %esi, %ebx +; I386-NOCMOV-NEXT: andl %ebp, %ebx +; I386-NOCMOV-NEXT: notl %ebp +; I386-NOCMOV-NEXT: andl %eax, %ebp +; I386-NOCMOV-NEXT: orl %ebp, %ebx +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %dl +; I386-NOCMOV-NEXT: testb %dl, %dl +; I386-NOCMOV-NEXT: sete %dh +; I386-NOCMOV-NEXT: movb %dh, %al +; I386-NOCMOV-NEXT: movzbl %al, %ebp +; I386-NOCMOV-NEXT: negl %ebp +; I386-NOCMOV-NEXT: movl %ecx, %esi +; I386-NOCMOV-NEXT: andl %ebp, %esi +; I386-NOCMOV-NEXT: notl %ebp +; I386-NOCMOV-NEXT: andl %ebx, %ebp +; I386-NOCMOV-NEXT: orl %ebp, %esi +; I386-NOCMOV-NEXT: testb %dl, %dl +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx +; I386-NOCMOV-NEXT: movb %al, %dl +; I386-NOCMOV-NEXT: movzbl %dl, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %ebx, %ecx +; I386-NOCMOV-NEXT: andl %esi, %ecx +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: orl %esi, %ecx +; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-NOCMOV-NEXT: pand %mm0, %mm0 +; I386-NOCMOV-NEXT: movq %mm0, (%esp) +; I386-NOCMOV-NEXT: movl (%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: addl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: popl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_mmx_nested_ctselect_with_pand: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %ebx +; I386-CMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-CMOV-NEXT: subl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 32 +; I386-CMOV-NEXT: .cfi_offset %esi, -12 +; I386-CMOV-NEXT: .cfi_offset %ebx, -8 +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %bl +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %bh +; I386-CMOV-NEXT: testb %bh, %bh +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %esi +; I386-CMOV-NEXT: testb %bl, %bl +; I386-CMOV-NEXT: cmovnel %esi, %edx +; I386-CMOV-NEXT: movl %edx, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel %ecx, %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-CMOV-NEXT: pand %mm0, %mm0 +; I386-CMOV-NEXT: movq %mm0, (%esp) +; I386-CMOV-NEXT: movl (%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: addl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-CMOV-NEXT: popl %ebx +; I386-CMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-CMOV-NEXT: retl + %mmx_a = bitcast i64 %a to <1 x i64> + %mmx_b = bitcast i64 %b to <1 x i64> + %mmx_c = bitcast i64 %c to <1 x i64> + %cmp1 = 
icmp ne i32 %cond1, 0 + %cmp2 = icmp ne i32 %cond2, 0 + %sel1 = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp2, <1 x i64> %mmx_a, <1 x i64> %mmx_b) + %sel2 = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp1, <1 x i64> %sel1, <1 x i64> %mmx_c) + %result = call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> %sel2, <1 x i64> %sel2) + ret <1 x i64> %result +} + +; Test MMX ct.select with por intrinsic +define <1 x i64> @test_mmx_ctselect_with_por(i32 %cond, i64 %a, i64 %b) { +; I386-NOCMOV-LABEL: test_mmx_ctselect_with_por: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: subl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40 +; I386-NOCMOV-NEXT: .cfi_offset %esi, -20 +; I386-NOCMOV-NEXT: .cfi_offset %edi, -16 +; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12 +; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8 +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %bl +; I386-NOCMOV-NEXT: testb %bl, %bl +; I386-NOCMOV-NEXT: sete %bh +; I386-NOCMOV-NEXT: movb %bh, %al +; I386-NOCMOV-NEXT: movzbl %al, %ebp +; I386-NOCMOV-NEXT: negl %ebp +; I386-NOCMOV-NEXT: movl %esi, %edi +; I386-NOCMOV-NEXT: andl %ebp, %edi +; I386-NOCMOV-NEXT: notl %ebp +; I386-NOCMOV-NEXT: andl %ecx, %ebp +; I386-NOCMOV-NEXT: orl %ebp, %edi +; I386-NOCMOV-NEXT: testb %bl, %bl +; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %ecx +; I386-NOCMOV-NEXT: andl %esi, %ecx +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: orl %esi, %ecx +; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-NOCMOV-NEXT: por %mm0, %mm0 +; I386-NOCMOV-NEXT: movq %mm0, (%esp) +; I386-NOCMOV-NEXT: movl (%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: addl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: popl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_mmx_ctselect_with_por: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: subl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 24 +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %dl +; I386-CMOV-NEXT: testb %dl, %dl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-CMOV-NEXT: por %mm0, %mm0 +; I386-CMOV-NEXT: movq %mm0, (%esp) +; I386-CMOV-NEXT: movl (%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: addl $20, 
%esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-CMOV-NEXT: retl + %mmx_a = bitcast i64 %a to <1 x i64> + %mmx_b = bitcast i64 %b to <1 x i64> + %cmp = icmp ne i32 %cond, 0 + %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b) + %result = call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %sel, <1 x i64> %sel) + ret <1 x i64> %result +} + +; Declare MMX intrinsics +declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.pand(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>) + +; Declare constant-time selection intrinsic +declare <1 x i64> @llvm.ct.select.v1i64(i1, <1 x i64>, <1 x i64>) diff --git a/llvm/test/CodeGen/X86/ctselect-i386.ll b/llvm/test/CodeGen/X86/ctselect-i386.ll new file mode 100644 index 0000000000000..d7345f1121540 --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect-i386.ll @@ -0,0 +1,267 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=I386-NOCMOV +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=I386-CMOV +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix=I386-NOCMOV + +; Comprehensive CTSELECT tests for i386 targets with scalar integer types +; - Without CMOV: constant-time implementation using post-RA expansion with bundled instructions +; - With CMOV: CMOV-based implementation +; - Verifies security properties: no conditional branches, constant execution time +; All expansion happens post-RA for better optimization control and constant-time guarantees + +; Test basic i32 functionality +define i32 @test_ctselect_i32_basic(i1 %cond, i32 %a, i32 %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_i32_basic: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %eax +; I386-NOCMOV-NEXT: andl %esi, %eax +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %ecx, %esi +; I386-NOCMOV-NEXT: orl %esi, %eax +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_i32_basic: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: retl + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test i16 functionality +define i16 @test_ctselect_i16_basic(i1 %cond, i16 %a, i16 %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_i16_basic: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbw %bh, %si +; I386-NOCMOV-NEXT: negw %si +; I386-NOCMOV-NEXT: movw %dx, %ax +; I386-NOCMOV-NEXT: andw %si, %ax +; I386-NOCMOV-NEXT: notw %si +; I386-NOCMOV-NEXT: andw 
%cx, %si +; I386-NOCMOV-NEXT: orw %si, %ax +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_i16_basic: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnew {{[0-9]+}}(%esp), %ax +; I386-CMOV-NEXT: retl + %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) + ret i16 %result +} + +; Test i8 functionality +define i8 @test_ctselect_i8_basic(i1 %cond, i8 %a, i8 %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_i8_basic: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %ah +; I386-NOCMOV-NEXT: movb %ah, %ch +; I386-NOCMOV-NEXT: negb %ch +; I386-NOCMOV-NEXT: movb %dl, %al +; I386-NOCMOV-NEXT: andb %ch, %al +; I386-NOCMOV-NEXT: notb %ch +; I386-NOCMOV-NEXT: andb %cl, %ch +; I386-NOCMOV-NEXT: orb %ch, %al +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_i8_basic: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: # kill: def $al killed $al killed $eax +; I386-CMOV-NEXT: retl + %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) + ret i8 %result +} + +; Test security property: constant-time execution for cryptographic use case +define i32 @test_crypto_key_select(i32 %secret_bit, i32 %key1, i32 %key2) nounwind { +; I386-NOCMOV-LABEL: test_crypto_key_select: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %al +; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %eax +; I386-NOCMOV-NEXT: andl %esi, %eax +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %ecx, %esi +; I386-NOCMOV-NEXT: orl %esi, %eax +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_crypto_key_select: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %cl +; I386-CMOV-NEXT: testb %cl, %cl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: retl + %cond = icmp ne i32 %secret_bit, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %key1, i32 %key2) + ret i32 %result +} + +; Test that no conditional branches appear in constant-time path +define i32 @test_no_conditional_branches(i32 %secret, i32 %val1, i32 %val2) nounwind { +; I386-NOCMOV-LABEL: test_no_conditional_branches: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %al +; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %eax +; I386-NOCMOV-NEXT: andl %esi, %eax +; I386-NOCMOV-NEXT: notl %esi +; 
I386-NOCMOV-NEXT: andl %ecx, %esi +; I386-NOCMOV-NEXT: orl %esi, %eax +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_no_conditional_branches: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %cl +; I386-CMOV-NEXT: testb %cl, %cl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: retl + %cond = icmp ne i32 %secret, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %val1, i32 %val2) + ret i32 %result +} + +; Test with comparison condition +define i32 @test_ctselect_i32_cmp(i32 %a, i32 %b, i32 %c) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_i32_cmp: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: cmpl %edx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %eax +; I386-NOCMOV-NEXT: andl %esi, %eax +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %ecx, %esi +; I386-NOCMOV-NEXT: orl %esi, %eax +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_i32_cmp: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %cl +; I386-CMOV-NEXT: testb %cl, %cl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: retl + %cond = icmp eq i32 %a, %c + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %b, i32 %c) + ret i32 %result +} + +; Test nested selects +define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_nested: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %eax, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %dl +; I386-NOCMOV-NEXT: movb %dl, %dh +; I386-NOCMOV-NEXT: movzbl %dh, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %ecx, %eax +; I386-NOCMOV-NEXT: andl %edi, %eax +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %esi, %edi +; I386-NOCMOV-NEXT: orl %edi, %eax +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_nested: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel %ecx, %eax +; I386-CMOV-NEXT: retl + %sel1 = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 
%b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %sel1, i32 %c) + ret i32 %sel2 +} + +; Declare ct.select intrinsics +declare i8 @llvm.ct.select.i8(i1, i8, i8) +declare i16 @llvm.ct.select.i16(i1, i16, i16) +declare i32 @llvm.ct.select.i32(i1, i32, i32) diff --git a/llvm/test/CodeGen/X86/ctselect-optimization.ll b/llvm/test/CodeGen/X86/ctselect-optimization.ll new file mode 100644 index 0000000000000..481d49971a937 --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect-optimization.ll @@ -0,0 +1,304 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov | FileCheck %s + +; Test ct.select optimization patterns + +; Test smin(x, 0) pattern optimization +define i32 @test_ctselect_smin_zero(i32 %x) { +; CHECK-LABEL: test_ctselect_smin_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: sets %cl +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test smax(x, 0) pattern optimization +define i32 @test_ctselect_smax_zero(i32 %x) { +; CHECK-LABEL: test_ctselect_smax_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: setg %cl +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp sgt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test generic smin pattern +define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) { +; CHECK-LABEL: test_ctselect_smin_generic: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: setl %cl +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp slt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test generic smax pattern +define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) { +; CHECK-LABEL: test_ctselect_smax_generic: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: setg %cl +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp sgt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umin pattern +define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) { +; CHECK-LABEL: test_ctselect_umin_generic: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: setb %cl +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umax pattern +define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) { +; CHECK-LABEL: test_ctselect_umax_generic: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: seta %cl +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp ugt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test abs pattern +define i32 @test_ctselect_abs(i32 %x) { +; CHECK-LABEL: test_ctselect_abs: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: negl %ecx +; CHECK-NEXT: testl %edi, %edi 
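+; Constant-time abs: the sign is captured with sets and the negated copy is
+; selected with cmovne, so no branch depends on the value of the input.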
+; CHECK-NEXT: sets %dl +; CHECK-NEXT: testb %dl, %dl +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: retq + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %neg, i32 %x) + ret i32 %result +} + +; Test nabs pattern (negative abs) +define i32 @test_ctselect_nabs(i32 %x) { +; CHECK-LABEL: test_ctselect_nabs: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: negl %eax +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: sets %cl +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %neg) + ret i32 %result +} + +; Test sign extension pattern +define i32 @test_ctselect_sign_extend(i32 %x) { +; CHECK-LABEL: test_ctselect_sign_extend: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: sets %cl +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movl $-1, %ecx +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: retq + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0) + ret i32 %result +} + +; Test zero extension pattern +define i32 @test_ctselect_zero_extend(i32 %x) { +; CHECK-LABEL: test_ctselect_zero_extend: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: setne %cl +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movl $1, %ecx +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: retq + %cmp = icmp ne i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 1, i32 0) + ret i32 %result +} + +; Test mask generation pattern +define i32 @test_ctselect_mask_generation(i32 %x) { +; CHECK-LABEL: test_ctselect_mask_generation: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: sets %cl +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movl $-1, %ecx +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: retq + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0) + ret i32 %result +} + +; Test constant folding with known condition +define i32 @test_ctselect_constant_folding_true(i32 %a, i32 %b) { +; CHECK-LABEL: test_ctselect_constant_folding_true: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: movb $1, %cl +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) { +; CHECK-LABEL: test_ctselect_constant_folding_false: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with identical operands +define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) { +; CHECK-LABEL: test_ctselect_identical_operands: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovnel %esi, %eax +; CHECK-NEXT: retq + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %x) + ret i32 %result +} + +; Test with inverted condition +define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) { +; CHECK-LABEL: test_ctselect_inverted_condition: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: sete %dl +; CHECK-NEXT: 
testb %dl, %dl +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: retq + %cmp = icmp eq i32 %x, %y + %not_cmp = xor i1 %cmp, true + %result = call i32 @llvm.ct.select.i32(i1 %not_cmp, i32 %a, i32 %b) + ret i32 %result +} + +; Test for 64-bit specific optimizations +define i64 @test_ctselect_i64_smin_zero(i64 %x) { +; CHECK-LABEL: test_ctselect_i64_smin_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: sets %cl +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovneq %rdi, %rax +; CHECK-NEXT: retq + %cmp = icmp slt i64 %x, 0 + %result = call i64 @llvm.ct.select.i64(i1 %cmp, i64 %x, i64 0) + ret i64 %result +} + +; Test for floating point optimizations +define float @test_ctselect_f32_zero_positive(float %x) { +; CHECK-LABEL: test_ctselect_f32_zero_positive: +; CHECK: # %bb.0: +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: seta %cl +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %eax, %edx +; CHECK-NEXT: movd %edx, %xmm0 +; CHECK-NEXT: retq + %cmp = fcmp ogt float %x, 0.0 + %result = call float @llvm.ct.select.f32(i1 %cmp, float %x, float 0.0) + ret float %result +} + +define double @test_ctselect_f64_zero_positive(double %x) { +; CHECK-LABEL: test_ctselect_f64_zero_positive: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorpd %xmm1, %xmm1 +; CHECK-NEXT: ucomisd %xmm1, %xmm0 +; CHECK-NEXT: seta %cl +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovneq %rax, %rdx +; CHECK-NEXT: movq %rdx, %xmm0 +; CHECK-NEXT: retq + %cmp = fcmp ogt double %x, 0.0 + %result = call double @llvm.ct.select.f64(i1 %cmp, double %x, double 0.0) + ret double %result +} + +; Test chain of ct.select operations +define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, i32 %d) { +; CHECK-LABEL: test_ctselect_chain: +; CHECK: # %bb.0: +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovnel %ecx, %r8d +; CHECK-NEXT: testb $1, %sil +; CHECK-NEXT: cmovnel %r8d, %r9d +; CHECK-NEXT: testb $1, %dl +; CHECK-NEXT: cmovnel %r9d, %eax +; CHECK-NEXT: retq + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + ret i32 %sel3 +} + +; Declare the intrinsics +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) +declare float @llvm.ct.select.f32(i1, float, float) +declare double @llvm.ct.select.f64(i1, double, double) diff --git a/llvm/test/CodeGen/X86/ctselect-vector.ll b/llvm/test/CodeGen/X86/ctselect-vector.ll new file mode 100644 index 0000000000000..2206e32cd6d34 --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect-vector.ll @@ -0,0 +1,1274 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 + +; Test ct.select functionality for vector types + +; 128-bit vectors +define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: test_ctselect_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; 
SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovaps %xmm2, %xmm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB0_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %xmm0, %xmm1 +; AVX512-NEXT: .LBB0_2: +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: test_ctselect_v4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: movaps %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovaps %xmm2, %xmm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB1_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %xmm0, %xmm1 +; AVX512-NEXT: .LBB1_2: +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = call <4 x float> @llvm.ct.select.v4f32(i1 
%cond, <4 x float> %a, <4 x float> %b) + ret <4 x float> %result +} + +define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { +; SSE2-LABEL: test_ctselect_v2i64: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovaps %xmm2, %xmm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB2_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %xmm0, %xmm1 +; AVX512-NEXT: .LBB2_2: +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = call <2 x i64> @llvm.ct.select.v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %result +} + +define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) { +; SSE2-LABEL: test_ctselect_v2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: movapd %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovaps %xmm2, %xmm0 +; AVX2-NEXT: retq +; AVX512-LABEL: 
test_ctselect_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB3_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovapd %xmm0, %xmm1 +; AVX512-NEXT: .LBB3_2: +; AVX512-NEXT: vmovapd %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = call <2 x double> @llvm.ct.select.v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) + ret <2 x double> %result +} + +; 256-bit vectors +define <8 x i32> @test_ctselect_v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) { +; SSE2-LABEL: test_ctselect_v8i32: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm4, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v8i32: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm3, %ymm3 +; AVX-NEXT: vmovd %eax, %ymm3 +; AVX-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] +; AVX-NEXT: vmovdqa %ymm3, %ymm2 +; AVX-NEXT: pand %ymm0, %ymm3 +; AVX-NEXT: pandn %ymm1, %ymm2 +; AVX-NEXT: por %ymm3, %ymm2 +; AVX-NEXT: vmovaps %ymm2, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm3, %ymm3 +; AVX2-NEXT: vmovd %eax, %ymm3 +; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovdqa %ymm3, %ymm2 +; AVX2-NEXT: pand %ymm0, %ymm3 +; AVX2-NEXT: pandn %ymm1, %ymm2 +; AVX2-NEXT: por %ymm3, %ymm2 +; AVX2-NEXT: vmovaps %ymm2, %ymm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB4_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %ymm0, %ymm1 +; AVX512-NEXT: .LBB4_2: +; AVX512-NEXT: vmovaps %ymm1, %ymm0 +; AVX512-NEXT: retq + %result = call <8 x i32> @llvm.ct.select.v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) + ret <8 x i32> %result +} + +define <8 x float> @test_ctselect_v8f32(i1 %cond, <8 x float> %a, <8 x float> %b) { +; SSE2-LABEL: test_ctselect_v8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] +; SSE2-NEXT: movaps %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm2 
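+; pand keeps the lanes of the first operand selected by the mask, pandn keeps
+; the complementary lanes of the second, and the por below merges them, i.e.
+; result = (a & mask) | (b & ~mask), with no data-dependent branch.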
+; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm4, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v8f32: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm3, %ymm3 +; AVX-NEXT: vmovd %eax, %ymm3 +; AVX-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] +; AVX-NEXT: vmovdqa %ymm3, %ymm2 +; AVX-NEXT: pand %ymm0, %ymm3 +; AVX-NEXT: pandn %ymm1, %ymm2 +; AVX-NEXT: por %ymm3, %ymm2 +; AVX-NEXT: vmovaps %ymm2, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm3, %ymm3 +; AVX2-NEXT: vmovd %eax, %ymm3 +; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovdqa %ymm3, %ymm2 +; AVX2-NEXT: pand %ymm0, %ymm3 +; AVX2-NEXT: pandn %ymm1, %ymm2 +; AVX2-NEXT: por %ymm3, %ymm2 +; AVX2-NEXT: vmovaps %ymm2, %ymm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB5_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %ymm0, %ymm1 +; AVX512-NEXT: .LBB5_2: +; AVX512-NEXT: vmovaps %ymm1, %ymm0 +; AVX512-NEXT: retq + %result = call <8 x float> @llvm.ct.select.v8f32(i1 %cond, <8 x float> %a, <8 x float> %b) + ret <8 x float> %result +} + +define <4 x i64> @test_ctselect_v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b) { +; SSE2-LABEL: test_ctselect_v4i64: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm4, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4i64: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm3, %ymm3 +; AVX-NEXT: vmovd %eax, %ymm3 +; AVX-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2] +; AVX-NEXT: vmovdqa %ymm3, %ymm2 +; AVX-NEXT: pand %ymm0, %ymm3 +; AVX-NEXT: pandn %ymm1, %ymm2 +; AVX-NEXT: por %ymm3, %ymm2 +; AVX-NEXT: vmovaps %ymm2, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm3, %ymm3 +; AVX2-NEXT: vmovd %eax, %ymm3 +; AVX2-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2] +; AVX2-NEXT: vmovdqa %ymm3, %ymm2 +; AVX2-NEXT: pand %ymm0, %ymm3 +; AVX2-NEXT: pandn %ymm1, %ymm2 +; AVX2-NEXT: por %ymm3, %ymm2 +; AVX2-NEXT: vmovaps %ymm2, %ymm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB6_2 +; 
AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %ymm0, %ymm1 +; AVX512-NEXT: .LBB6_2: +; AVX512-NEXT: vmovaps %ymm1, %ymm0 +; AVX512-NEXT: retq + %result = call <4 x i64> @llvm.ct.select.v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b) + ret <4 x i64> %result +} + +define <4 x double> @test_ctselect_v4f64(i1 %cond, <4 x double> %a, <4 x double> %b) { +; SSE2-LABEL: test_ctselect_v4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] +; SSE2-NEXT: movapd %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movapd %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm4, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4f64: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm3, %ymm3 +; AVX-NEXT: vmovd %eax, %ymm3 +; AVX-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2] +; AVX-NEXT: vmovdqa %ymm3, %ymm2 +; AVX-NEXT: pand %ymm0, %ymm3 +; AVX-NEXT: pandn %ymm1, %ymm2 +; AVX-NEXT: por %ymm3, %ymm2 +; AVX-NEXT: vmovaps %ymm2, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm3, %ymm3 +; AVX2-NEXT: vmovd %eax, %ymm3 +; AVX2-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2] +; AVX2-NEXT: vmovdqa %ymm3, %ymm2 +; AVX2-NEXT: pand %ymm0, %ymm3 +; AVX2-NEXT: pandn %ymm1, %ymm2 +; AVX2-NEXT: por %ymm3, %ymm2 +; AVX2-NEXT: vmovaps %ymm2, %ymm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB7_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovapd %ymm0, %ymm1 +; AVX512-NEXT: .LBB7_2: +; AVX512-NEXT: vmovapd %ymm1, %ymm0 +; AVX512-NEXT: retq + %result = call <4 x double> @llvm.ct.select.v4f64(i1 %cond, <4 x double> %a, <4 x double> %b) + ret <4 x double> %result +} + +; 512-bit vectors (AVX512 only) +define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b) { +; SSE2-LABEL: test_ctselect_v16i32: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: movd %eax, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] +; SSE2-NEXT: movdqa %xmm9, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm9 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm9, %xmm8 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: 
movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm7, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: movaps %xmm8, %xmm0 +; SSE2-NEXT: movaps %xmm4, %xmm1 +; SSE2-NEXT: movaps %xmm5, %xmm2 +; SSE2-NEXT: movaps %xmm6, %xmm3 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v16i32: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm5, %ymm5 +; AVX-NEXT: vmovd %eax, %ymm5 +; AVX-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] +; AVX-NEXT: vmovdqa %ymm5, %ymm4 +; AVX-NEXT: pand %ymm0, %ymm5 +; AVX-NEXT: pandn %ymm2, %ymm4 +; AVX-NEXT: por %ymm5, %ymm4 +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm0, %ymm0 +; AVX-NEXT: vmovd %eax, %ymm0 +; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX-NEXT: vmovdqa %ymm0, %ymm2 +; AVX-NEXT: pand %ymm1, %ymm0 +; AVX-NEXT: pandn %ymm3, %ymm2 +; AVX-NEXT: por %ymm0, %ymm2 +; AVX-NEXT: vmovaps %ymm4, %ymm0 +; AVX-NEXT: vmovaps %ymm2, %ymm1 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v16i32: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm5, %ymm5 +; AVX2-NEXT: vmovd %eax, %ymm5 +; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovdqa %ymm5, %ymm4 +; AVX2-NEXT: pand %ymm0, %ymm5 +; AVX2-NEXT: pandn %ymm2, %ymm4 +; AVX2-NEXT: por %ymm5, %ymm4 +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm0, %ymm0 +; AVX2-NEXT: vmovd %eax, %ymm0 +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovdqa %ymm0, %ymm2 +; AVX2-NEXT: pand %ymm1, %ymm0 +; AVX2-NEXT: pandn %ymm3, %ymm2 +; AVX2-NEXT: por %ymm0, %ymm2 +; AVX2-NEXT: vmovaps %ymm4, %ymm0 +; AVX2-NEXT: vmovaps %ymm2, %ymm1 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v16i32: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB8_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %zmm0, %zmm1 +; AVX512-NEXT: .LBB8_2: +; AVX512-NEXT: vmovaps %zmm1, %zmm0 +; AVX512-NEXT: retq + %result = call <16 x i32> @llvm.ct.select.v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b) + ret <16 x i32> %result +} + +define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float> %b) { +; SSE2-LABEL: test_ctselect_v16f32: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: movd %eax, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] +; SSE2-NEXT: movaps %xmm9, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm9 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm9, %xmm8 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl 
%al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movaps %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movaps %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movaps %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm7, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: movaps %xmm8, %xmm0 +; SSE2-NEXT: movaps %xmm4, %xmm1 +; SSE2-NEXT: movaps %xmm5, %xmm2 +; SSE2-NEXT: movaps %xmm6, %xmm3 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v16f32: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm5, %ymm5 +; AVX-NEXT: vmovd %eax, %ymm5 +; AVX-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] +; AVX-NEXT: vmovdqa %ymm5, %ymm4 +; AVX-NEXT: pand %ymm0, %ymm5 +; AVX-NEXT: pandn %ymm2, %ymm4 +; AVX-NEXT: por %ymm5, %ymm4 +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm0, %ymm0 +; AVX-NEXT: vmovd %eax, %ymm0 +; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX-NEXT: vmovdqa %ymm0, %ymm2 +; AVX-NEXT: pand %ymm1, %ymm0 +; AVX-NEXT: pandn %ymm3, %ymm2 +; AVX-NEXT: por %ymm0, %ymm2 +; AVX-NEXT: vmovaps %ymm4, %ymm0 +; AVX-NEXT: vmovaps %ymm2, %ymm1 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v16f32: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm5, %ymm5 +; AVX2-NEXT: vmovd %eax, %ymm5 +; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovdqa %ymm5, %ymm4 +; AVX2-NEXT: pand %ymm0, %ymm5 +; AVX2-NEXT: pandn %ymm2, %ymm4 +; AVX2-NEXT: por %ymm5, %ymm4 +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm0, %ymm0 +; AVX2-NEXT: vmovd %eax, %ymm0 +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovdqa %ymm0, %ymm2 +; AVX2-NEXT: pand %ymm1, %ymm0 +; AVX2-NEXT: pandn %ymm3, %ymm2 +; AVX2-NEXT: por %ymm0, %ymm2 +; AVX2-NEXT: vmovaps %ymm4, %ymm0 +; AVX2-NEXT: vmovaps %ymm2, %ymm1 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v16f32: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB9_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %zmm0, %zmm1 +; AVX512-NEXT: .LBB9_2: +; AVX512-NEXT: vmovaps %zmm1, %zmm0 +; AVX512-NEXT: retq + %result = call <16 x float> @llvm.ct.select.v16f32(i1 %cond, <16 x float> %a, <16 x float> %b) + ret <16 x float> %result +} + +define <8 x i64> @test_ctselect_v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) { +; SSE2-LABEL: test_ctselect_v8i64: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax 
+; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: movd %eax, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] +; SSE2-NEXT: movdqa %xmm9, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm9 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm9, %xmm8 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm7, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: movaps %xmm8, %xmm0 +; SSE2-NEXT: movaps %xmm4, %xmm1 +; SSE2-NEXT: movaps %xmm5, %xmm2 +; SSE2-NEXT: movaps %xmm6, %xmm3 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v8i64: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm5, %ymm5 +; AVX-NEXT: vmovd %eax, %ymm5 +; AVX-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2] +; AVX-NEXT: vmovdqa %ymm5, %ymm4 +; AVX-NEXT: pand %ymm0, %ymm5 +; AVX-NEXT: pandn %ymm2, %ymm4 +; AVX-NEXT: por %ymm5, %ymm4 +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm0, %ymm0 +; AVX-NEXT: vmovd %eax, %ymm0 +; AVX-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX-NEXT: vmovdqa %ymm0, %ymm2 +; AVX-NEXT: pand %ymm1, %ymm0 +; AVX-NEXT: pandn %ymm3, %ymm2 +; AVX-NEXT: por %ymm0, %ymm2 +; AVX-NEXT: vmovaps %ymm4, %ymm0 +; AVX-NEXT: vmovaps %ymm2, %ymm1 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v8i64: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm5, %ymm5 +; AVX2-NEXT: vmovd %eax, %ymm5 +; AVX2-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2] +; AVX2-NEXT: vmovdqa %ymm5, %ymm4 +; AVX2-NEXT: pand %ymm0, %ymm5 +; AVX2-NEXT: pandn %ymm2, %ymm4 +; AVX2-NEXT: por %ymm5, %ymm4 +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm0, %ymm0 +; AVX2-NEXT: vmovd %eax, %ymm0 +; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX2-NEXT: vmovdqa %ymm0, %ymm2 +; AVX2-NEXT: pand %ymm1, %ymm0 +; AVX2-NEXT: pandn %ymm3, %ymm2 +; AVX2-NEXT: por %ymm0, %ymm2 +; AVX2-NEXT: vmovaps %ymm4, %ymm0 +; AVX2-NEXT: vmovaps %ymm2, %ymm1 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v8i64: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB10_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %zmm0, %zmm1 +; AVX512-NEXT: .LBB10_2: +; AVX512-NEXT: vmovaps %zmm1, %zmm0 +; AVX512-NEXT: retq + %result = call <8 x i64> @llvm.ct.select.v8i64(i1 %cond, <8 x 
i64> %a, <8 x i64> %b) + ret <8 x i64> %result +} + +define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double> %b) { +; SSE2-LABEL: test_ctselect_v8f64: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: movd %eax, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] +; SSE2-NEXT: movapd %xmm9, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm9 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm9, %xmm8 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movapd %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movapd %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movapd %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm7, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: movaps %xmm8, %xmm0 +; SSE2-NEXT: movaps %xmm4, %xmm1 +; SSE2-NEXT: movaps %xmm5, %xmm2 +; SSE2-NEXT: movaps %xmm6, %xmm3 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v8f64: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm5, %ymm5 +; AVX-NEXT: vmovd %eax, %ymm5 +; AVX-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2] +; AVX-NEXT: vmovdqa %ymm5, %ymm4 +; AVX-NEXT: pand %ymm0, %ymm5 +; AVX-NEXT: pandn %ymm2, %ymm4 +; AVX-NEXT: por %ymm5, %ymm4 +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm0, %ymm0 +; AVX-NEXT: vmovd %eax, %ymm0 +; AVX-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX-NEXT: vmovdqa %ymm0, %ymm2 +; AVX-NEXT: pand %ymm1, %ymm0 +; AVX-NEXT: pandn %ymm3, %ymm2 +; AVX-NEXT: por %ymm0, %ymm2 +; AVX-NEXT: vmovaps %ymm4, %ymm0 +; AVX-NEXT: vmovaps %ymm2, %ymm1 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v8f64: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm5, %ymm5 +; AVX2-NEXT: vmovd %eax, %ymm5 +; AVX2-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2] +; AVX2-NEXT: vmovdqa %ymm5, %ymm4 +; AVX2-NEXT: pand %ymm0, %ymm5 +; AVX2-NEXT: pandn %ymm2, %ymm4 +; AVX2-NEXT: por %ymm5, %ymm4 +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm0, %ymm0 +; AVX2-NEXT: vmovd %eax, %ymm0 +; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX2-NEXT: vmovdqa %ymm0, %ymm2 +; AVX2-NEXT: pand %ymm1, %ymm0 +; AVX2-NEXT: pandn %ymm3, %ymm2 +; AVX2-NEXT: por %ymm0, %ymm2 +; AVX2-NEXT: vmovaps %ymm4, %ymm0 +; AVX2-NEXT: vmovaps %ymm2, %ymm1 +; AVX2-NEXT: retq +; AVX512-LABEL: 
test_ctselect_v8f64: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB11_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovapd %zmm0, %zmm1 +; AVX512-NEXT: .LBB11_2: +; AVX512-NEXT: vmovapd %zmm1, %zmm0 +; AVX512-NEXT: retq + %result = call <8 x double> @llvm.ct.select.v8f64(i1 %cond, <8 x double> %a, <8 x double> %b) + ret <8 x double> %result +} + +; Test with constant conditions for vector types +define <4 x i32> @test_ctselect_v4i32_const_true(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: test_ctselect_v4i32_const_true: +; SSE2: # %bb.0: +; SSE2-NEXT: movb $1, %al +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4i32_const_true: +; AVX: # %bb.0: +; AVX-NEXT: movb $1, %al +; AVX-NEXT: testb %al, %al +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4i32_const_true: +; AVX2: # %bb.0: +; AVX2-NEXT: movb $1, %al +; AVX2-NEXT: testb %al, %al +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovaps %xmm2, %xmm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v4i32_const_true: +; AVX512: # %bb.0: +; AVX512-NEXT: retq + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 true, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +define <4 x i32> @test_ctselect_v4i32_const_false(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: test_ctselect_v4i32_const_false: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4i32_const_false: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: testb %al, %al +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4i32_const_false: +; AVX2: # %bb.0: +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: testb %al, %al +; AVX2-NEXT: movl $0, 
%eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovaps %xmm2, %xmm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v4i32_const_false: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 false, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test with comparison conditions for vector types +define <4 x i32> @test_ctselect_v4i32_icmp(i32 %x, i32 %y, <4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: test_ctselect_v4i32_icmp: +; SSE2: # %bb.0: +; SSE2-NEXT: cmpl %esi, %edi +; SSE2-NEXT: sete %al +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4i32_icmp: +; AVX: # %bb.0: +; AVX-NEXT: cmpl %esi, %edi +; AVX-NEXT: sete %al +; AVX-NEXT: testb %al, %al +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4i32_icmp: +; AVX2: # %bb.0: +; AVX2-NEXT: cmpl %esi, %edi +; AVX2-NEXT: sete %al +; AVX2-NEXT: testb %al, %al +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovaps %xmm2, %xmm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v4i32_icmp: +; AVX512: # %bb.0: +; AVX512-NEXT: cmpl %esi, %edi +; AVX512-NEXT: je .LBB14_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: .LBB14_2: +; AVX512-NEXT: retq + %cond = icmp eq i32 %x, %y + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Declare the intrinsics +declare <4 x i32> @llvm.ct.select.v4i32(i1, <4 x i32>, <4 x i32>) +declare <4 x float> @llvm.ct.select.v4f32(i1, <4 x float>, <4 x float>) +declare <2 x i64> @llvm.ct.select.v2i64(i1, <2 x i64>, <2 x i64>) +declare <2 x double> @llvm.ct.select.v2f64(i1, <2 x double>, <2 x double>) +declare <8 x i32> @llvm.ct.select.v8i32(i1, <8 x i32>, <8 x i32>) +declare <8 x float> @llvm.ct.select.v8f32(i1, <8 x float>, <8 x float>) +declare <4 x i64> @llvm.ct.select.v4i64(i1, <4 x i64>, <4 x i64>) +declare <4 x double> @llvm.ct.select.v4f64(i1, <4 x double>, <4 x double>) +declare <16 x i32> @llvm.ct.select.v16i32(i1, <16 x i32>, <16 x i32>) +declare <16 x float> @llvm.ct.select.v16f32(i1, <16 x float>, <16 x float>) +declare <8 x i64> @llvm.ct.select.v8i64(i1, 
<8 x i64>, <8 x i64>) +declare <8 x double> @llvm.ct.select.v8f64(i1, <8 x double>, <8 x double>) diff --git a/llvm/test/CodeGen/X86/ctselect.ll b/llvm/test/CodeGen/X86/ctselect.ll index 095787a5e2a4b..d76ae0365f28c 100644 --- a/llvm/test/CodeGen/X86/ctselect.ll +++ b/llvm/test/CodeGen/X86/ctselect.ll @@ -8,39 +8,33 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { ; X64-LABEL: test_ctselect_i8: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: andb $1, %dil -; X64-NEXT: leal -1(%rdi), %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: negb %cl -; X64-NEXT: andb %sil, %cl -; X64-NEXT: andb %dl, %al -; X64-NEXT: orb %cl, %al +; X64-NEXT: movl %edx, %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %esi, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_i8: ; X32: # %bb.0: -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-NEXT: andb $1, %al -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: negb %cl -; X32-NEXT: andb {{[0-9]+}}(%esp), %cl -; X32-NEXT: decb %al -; X32-NEXT: andb {{[0-9]+}}(%esp), %al -; X32-NEXT: orb %cl, %al +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: # kill: def $al killed $al killed $eax ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_i8: ; X32-NOCMOV: # %bb.0: -; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: andb $1, %al -; X32-NOCMOV-NEXT: movl %eax, %ecx -; X32-NOCMOV-NEXT: negb %cl -; X32-NOCMOV-NEXT: andb {{[0-9]+}}(%esp), %cl -; X32-NOCMOV-NEXT: decb %al -; X32-NOCMOV-NEXT: andb {{[0-9]+}}(%esp), %al -; X32-NOCMOV-NEXT: orb %cl, %al +; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %ah +; X32-NOCMOV-NEXT: movb %ah, %ch +; X32-NOCMOV-NEXT: negb %ch +; X32-NOCMOV-NEXT: movb %dl, %al +; X32-NOCMOV-NEXT: andb %ch, %al +; X32-NOCMOV-NEXT: notb %ch +; X32-NOCMOV-NEXT: andb %cl, %ch +; X32-NOCMOV-NEXT: orb %ch, %al ; X32-NOCMOV-NEXT: retl %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) ret i8 %result @@ -49,39 +43,43 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { ; X64-LABEL: test_ctselect_i16: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: andl $1, %edi -; X64-NEXT: leal -1(%rdi), %ecx -; X64-NEXT: movl %edi, %eax -; X64-NEXT: negl %eax -; X64-NEXT: andl %esi, %eax -; X64-NEXT: andl %edx, %ecx -; X64-NEXT: orl %ecx, %eax +; X64-NEXT: movl %edx, %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %esi, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_i16: ; X32: # %bb.0: -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-NEXT: andl $1, %eax -; X32-NEXT: leal -1(%eax), %ecx -; X32-NEXT: andw {{[0-9]+}}(%esp), %cx -; X32-NEXT: negl %eax -; X32-NEXT: andw {{[0-9]+}}(%esp), %ax -; X32-NEXT: orl %ecx, %eax -; X32-NEXT: # kill: def $ax killed $ax killed $eax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnew {{[0-9]+}}(%esp), %ax ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_i16: ; X32-NOCMOV: # %bb.0: -; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: andl $1, %eax -; X32-NOCMOV-NEXT: leal -1(%eax), %ecx -; X32-NOCMOV-NEXT: andw {{[0-9]+}}(%esp), %cx -; X32-NOCMOV-NEXT: negl %eax -; X32-NOCMOV-NEXT: andw 
{{[0-9]+}}(%esp), %ax -; X32-NOCMOV-NEXT: orl %ecx, %eax -; X32-NOCMOV-NEXT: # kill: def $ax killed $ax killed $eax +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbw %bh, %si +; X32-NOCMOV-NEXT: negw %si +; X32-NOCMOV-NEXT: movw %dx, %ax +; X32-NOCMOV-NEXT: andw %si, %ax +; X32-NOCMOV-NEXT: notw %si +; X32-NOCMOV-NEXT: andw %cx, %si +; X32-NOCMOV-NEXT: orw %si, %ax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 ; X32-NOCMOV-NEXT: retl %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) ret i16 %result @@ -90,38 +88,42 @@ define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { ; X64-LABEL: test_ctselect_i32: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: andl $1, %edi -; X64-NEXT: leal -1(%rdi), %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: negl %ecx -; X64-NEXT: andl %esi, %ecx -; X64-NEXT: andl %edx, %eax -; X64-NEXT: orl %ecx, %eax +; X64-NEXT: movl %edx, %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %esi, %eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_i32: ; X32: # %bb.0: -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-NEXT: andl $1, %eax -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: negl %ecx -; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: decl %eax -; X32-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NEXT: orl %ecx, %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_i32: ; X32-NOCMOV: # %bb.0: -; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: andl $1, %eax -; X32-NOCMOV-NEXT: movl %eax, %ecx -; X32-NOCMOV-NEXT: negl %ecx -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: decl %eax -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: orl %ecx, %eax +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 ; X32-NOCMOV-NEXT: retl %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) ret i32 %result @@ -130,56 +132,66 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { ; X64-LABEL: test_ctselect_i64: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def 
$rdi -; X64-NEXT: andl $1, %edi -; X64-NEXT: leaq -1(%rdi), %rax -; X64-NEXT: negq %rdi -; X64-NEXT: andq %rsi, %rdi -; X64-NEXT: andq %rdx, %rax -; X64-NEXT: orq %rdi, %rax +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rsi, %rax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_i64: ; X32: # %bb.0: -; X32-NEXT: pushl %esi -; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: .cfi_offset %esi, -8 -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: xorl %edx, %eax -; X32-NEXT: andl $1, %esi -; X32-NEXT: negl %esi -; X32-NEXT: andl %esi, %eax -; X32-NEXT: xorl %edx, %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: xorl %ecx, %edx -; X32-NEXT: andl %esi, %edx -; X32-NEXT: xorl %ecx, %edx -; X32-NEXT: popl %esi -; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edx ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_i64: ; X32-NOCMOV: # %bb.0: -; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: pushl %ebp ; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 -; X32-NOCMOV-NEXT: .cfi_offset %esi, -8 -; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %esi +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: pushl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -20 +; X32-NOCMOV-NEXT: .cfi_offset %edi, -16 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebp, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: xorl %edx, %eax -; X32-NOCMOV-NEXT: andl $1, %esi +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X32-NOCMOV-NEXT: testb $1, %bl +; X32-NOCMOV-NEXT: sete %bh +; X32-NOCMOV-NEXT: movb %bh, %cl +; X32-NOCMOV-NEXT: movzbl %cl, %esi ; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax ; X32-NOCMOV-NEXT: andl %esi, %eax -; X32-NOCMOV-NEXT: xorl %edx, %eax -; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NOCMOV-NEXT: xorl %ecx, %edx +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ebp, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: testb $1, %bl +; X32-NOCMOV-NEXT: sete %cl +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NOCMOV-NEXT: movb %cl, %ch +; X32-NOCMOV-NEXT: movzbl %ch, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edi, %edx ; X32-NOCMOV-NEXT: andl %esi, %edx -; X32-NOCMOV-NEXT: xorl %ecx, %edx +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ebx, %esi +; X32-NOCMOV-NEXT: orl %esi, %edx ; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: popl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebp ; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 ; X32-NOCMOV-NEXT: retl %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) @@ -189,51 +201,74 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { define float @test_ctselect_f32(i1 %cond, float %a, float %b) { ; X64-LABEL: test_ctselect_f32: ; X64: # %bb.0: -; X64-NEXT: movd %xmm1, %eax -; X64-NEXT: movd 
%xmm0, %ecx -; X64-NEXT: andl $1, %edi -; X64-NEXT: movl %edi, %edx -; X64-NEXT: negl %edx -; X64-NEXT: andl %ecx, %edx -; X64-NEXT: decl %edi -; X64-NEXT: andl %eax, %edi -; X64-NEXT: orl %edx, %edi -; X64-NEXT: movd %edi, %xmm0 +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: movd %xmm1, %ecx +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %eax, %ecx +; X64-NEXT: movd %ecx, %xmm0 ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_f32: ; X32: # %bb.0: -; X32-NEXT: pushl %eax +; X32-NEXT: pushl %edi ; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-NEXT: andl $1, %eax -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: negl %ecx -; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: decl %eax -; X32-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NEXT: orl %ecx, %eax -; X32-NEXT: movl %eax, (%esp) +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: pushl %eax +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: sete %al +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, (%esp) ; X32-NEXT: flds (%esp) -; X32-NEXT: popl %eax +; X32-NEXT: addl $4, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi ; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_f32: ; X32-NOCMOV: # %bb.0: -; X32-NOCMOV-NEXT: pushl %eax +; X32-NOCMOV-NEXT: pushl %edi ; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 -; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: andl $1, %eax -; X32-NOCMOV-NEXT: movl %eax, %ecx -; X32-NOCMOV-NEXT: negl %ecx -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: decl %eax -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: orl %ecx, %eax -; X32-NOCMOV-NEXT: movl %eax, (%esp) +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: pushl %eax +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %edi, -8 +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %al +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movb %al, %ah +; X32-NOCMOV-NEXT: movzbl %ah, %edi +; X32-NOCMOV-NEXT: negl %edi +; X32-NOCMOV-NEXT: movl %edx, %esi +; X32-NOCMOV-NEXT: andl %edi, %esi +; X32-NOCMOV-NEXT: notl %edi +; X32-NOCMOV-NEXT: andl %ecx, %edi +; X32-NOCMOV-NEXT: orl %edi, %esi +; X32-NOCMOV-NEXT: movl %esi, (%esp) ; X32-NOCMOV-NEXT: flds (%esp) -; X32-NOCMOV-NEXT: popl %eax +; X32-NOCMOV-NEXT: addl $4, %esp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %edi ; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 ; X32-NOCMOV-NEXT: retl %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) @@ -243,74 +278,96 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) { define double @test_ctselect_f64(i1 %cond, double %a, double %b) { ; X64-LABEL: test_ctselect_f64: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: movq %xmm1, %rax -; X64-NEXT: movq %xmm0, 
%rcx -; X64-NEXT: andl $1, %edi -; X64-NEXT: movq %rdi, %rdx -; X64-NEXT: negq %rdx -; X64-NEXT: andq %rcx, %rdx -; X64-NEXT: decq %rdi -; X64-NEXT: andq %rax, %rdi -; X64-NEXT: orq %rdx, %rdi -; X64-NEXT: movq %rdi, %xmm0 +; X64-NEXT: movq %xmm0, %rax +; X64-NEXT: movq %xmm1, %rcx +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rax, %rcx +; X64-NEXT: movq %rcx, %xmm0 ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_f64: ; X32: # %bb.0: -; X32-NEXT: pushl %esi +; X32-NEXT: pushl %edi ; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 ; X32-NEXT: subl $8, %esp -; X32-NEXT: .cfi_def_cfa_offset 16 -; X32-NEXT: .cfi_offset %esi, -8 -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: .cfi_def_cfa_offset 20 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: sete %al +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: xorl %edx, %esi -; X32-NEXT: andl $1, %ecx -; X32-NEXT: negl %ecx -; X32-NEXT: andl %ecx, %esi -; X32-NEXT: xorl %edx, %esi -; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, (%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: xorl %eax, %edx -; X32-NEXT: andl %ecx, %edx -; X32-NEXT: xorl %eax, %edx -; X32-NEXT: movl %edx, (%esp) +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X32-NEXT: fldl (%esp) ; X32-NEXT: addl $8, %esp -; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_def_cfa_offset 12 ; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi ; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_f64: ; X32-NOCMOV: # %bb.0: -; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: pushl %edi ; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 ; X32-NOCMOV-NEXT: subl $8, %esp -; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 -; X32-NOCMOV-NEXT: .cfi_offset %esi, -8 -; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %edi, -8 +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %al +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NOCMOV-NEXT: xorl %edx, %esi -; X32-NOCMOV-NEXT: andl $1, %ecx -; X32-NOCMOV-NEXT: negl %ecx -; X32-NOCMOV-NEXT: andl %ecx, %esi -; X32-NOCMOV-NEXT: xorl %edx, %esi -; X32-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: movb %al, %ah +; X32-NOCMOV-NEXT: movzbl %ah, %edi +; X32-NOCMOV-NEXT: negl %edi +; X32-NOCMOV-NEXT: movl %edx, %esi +; X32-NOCMOV-NEXT: andl %edi, %esi +; X32-NOCMOV-NEXT: notl %edi +; X32-NOCMOV-NEXT: andl %ecx, %edi +; X32-NOCMOV-NEXT: orl %edi, %esi +; X32-NOCMOV-NEXT: movl %esi, (%esp) +; X32-NOCMOV-NEXT: movl 
{{[0-9]+}}(%esp), %ecx ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NOCMOV-NEXT: xorl %eax, %edx -; X32-NOCMOV-NEXT: andl %ecx, %edx -; X32-NOCMOV-NEXT: xorl %eax, %edx -; X32-NOCMOV-NEXT: movl %edx, (%esp) +; X32-NOCMOV-NEXT: movb %al, %ah +; X32-NOCMOV-NEXT: movzbl %ah, %edi +; X32-NOCMOV-NEXT: negl %edi +; X32-NOCMOV-NEXT: movl %edx, %esi +; X32-NOCMOV-NEXT: andl %edi, %esi +; X32-NOCMOV-NEXT: notl %edi +; X32-NOCMOV-NEXT: andl %ecx, %edi +; X32-NOCMOV-NEXT: orl %edi, %esi +; X32-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X32-NOCMOV-NEXT: fldl (%esp) ; X32-NOCMOV-NEXT: addl $8, %esp -; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 ; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %edi ; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 ; X32-NOCMOV-NEXT: retl %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) @@ -320,37 +377,42 @@ define double @test_ctselect_f64(i1 %cond, double %a, double %b) { define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { ; X64-LABEL: test_ctselect_ptr: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: andl $1, %edi -; X64-NEXT: leaq -1(%rdi), %rax -; X64-NEXT: negq %rdi -; X64-NEXT: andq %rsi, %rdi -; X64-NEXT: andq %rdx, %rax -; X64-NEXT: orq %rdi, %rax +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rsi, %rax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_ptr: ; X32: # %bb.0: -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-NEXT: andl $1, %eax -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: negl %ecx -; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: decl %eax -; X32-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NEXT: orl %ecx, %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_ptr: ; X32-NOCMOV: # %bb.0: -; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: andl $1, %eax -; X32-NOCMOV-NEXT: movl %eax, %ecx -; X32-NOCMOV-NEXT: negl %ecx -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: decl %eax -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: orl %ecx, %eax +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 ; X32-NOCMOV-NEXT: retl %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) ret ptr %result @@ -360,17 +422,45 @@ define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { define i32 @test_ctselect_const_true(i32 %a, i32 %b) { ; X64-LABEL: test_ctselect_const_true: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %esi, %eax +; X64-NEXT: movb $1, %cl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edi, %eax ; X64-NEXT: retq ; ; 
X32-LABEL: test_ctselect_const_true: ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movb $1, %cl +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_const_true: ; X32-NOCMOV: # %bb.0: -; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movb $1, %al +; X32-NOCMOV-NEXT: testb %al, %al +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 ; X32-NOCMOV-NEXT: retl %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) ret i32 %result @@ -380,18 +470,44 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) { ; X64-LABEL: test_ctselect_const_false: ; X64: # %bb.0: ; X64-NEXT: movl %esi, %eax +; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edi, %eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_const_false: ; X32: # %bb.0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: orl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: xorl %ecx, %ecx +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_const_false: ; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NOCMOV-NEXT: xorl %eax, %eax -; X32-NOCMOV-NEXT: orl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: testb %al, %al +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 ; X32-NOCMOV-NEXT: retl %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) ret i32 %result @@ -401,43 +517,50 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) { define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { ; X64-LABEL: test_ctselect_icmp_eq: ; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movl %ecx, %eax ; X64-NEXT: cmpl %esi, %edi -; X64-NEXT: sete %al -; X64-NEXT: movl %eax, %esi -; X64-NEXT: negl %esi -; X64-NEXT: andl %edx, %esi -; X64-NEXT: decl %eax -; X64-NEXT: andl %ecx, %eax -; X64-NEXT: orl %esi, %eax +; X64-NEXT: sete %cl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edx, %eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_icmp_eq: ; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: xorl %eax, %eax ; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: sete %al -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: negl %ecx -; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: decl %eax -; X32-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NEXT: orl %ecx, %eax +; X32-NEXT: sete %cl +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_icmp_eq: ; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: xorl %eax, %eax -; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; X32-NOCMOV-NEXT: sete %al -; X32-NOCMOV-NEXT: movl %eax, %ecx -; X32-NOCMOV-NEXT: negl %ecx -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: decl %eax -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: orl %ecx, %eax +; X32-NOCMOV-NEXT: testb %al, %al +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 ; X32-NOCMOV-NEXT: retl %cond = icmp eq i32 %x, %y %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) @@ -447,43 +570,50 @@ define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { ; X64-LABEL: test_ctselect_icmp_ne: ; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movl %ecx, %eax ; X64-NEXT: cmpl %esi, %edi -; X64-NEXT: setne %al -; X64-NEXT: movl %eax, %esi -; X64-NEXT: negl %esi -; X64-NEXT: andl %edx, %esi -; X64-NEXT: decl %eax -; X64-NEXT: andl %ecx, %eax -; X64-NEXT: orl %esi, %eax +; X64-NEXT: setne %cl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edx, %eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_icmp_ne: ; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: xorl %eax, %eax ; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: setne %al -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: negl %ecx -; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: decl %eax -; X32-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NEXT: orl %ecx, %eax +; X32-NEXT: setne %cl +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_icmp_ne: ; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: xorl %eax, %eax -; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; X32-NOCMOV-NEXT: setne 
%al -; X32-NOCMOV-NEXT: movl %eax, %ecx -; X32-NOCMOV-NEXT: negl %ecx -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: decl %eax -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: orl %ecx, %eax +; X32-NOCMOV-NEXT: testb %al, %al +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 ; X32-NOCMOV-NEXT: retl %cond = icmp ne i32 %x, %y %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) @@ -493,43 +623,50 @@ define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) { ; X64-LABEL: test_ctselect_icmp_slt: ; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movl %ecx, %eax ; X64-NEXT: cmpl %esi, %edi -; X64-NEXT: setl %al -; X64-NEXT: movl %eax, %esi -; X64-NEXT: negl %esi -; X64-NEXT: andl %edx, %esi -; X64-NEXT: decl %eax -; X64-NEXT: andl %ecx, %eax -; X64-NEXT: orl %esi, %eax +; X64-NEXT: setl %cl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edx, %eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_icmp_slt: ; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: xorl %eax, %eax ; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: setl %al -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: negl %ecx -; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: decl %eax -; X32-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NEXT: orl %ecx, %eax +; X32-NEXT: setl %cl +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_icmp_slt: ; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: xorl %eax, %eax -; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; X32-NOCMOV-NEXT: setl %al -; X32-NOCMOV-NEXT: movl %eax, %ecx -; X32-NOCMOV-NEXT: negl %ecx -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: decl %eax -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: orl %ecx, %eax +; X32-NOCMOV-NEXT: testb %al, %al +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 ; X32-NOCMOV-NEXT: retl %cond = icmp slt i32 %x, %y %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) @@ -539,39 +676,50 @@ define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) { define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { ; X64-LABEL: 
test_ctselect_icmp_ult: ; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movl %ecx, %eax ; X64-NEXT: cmpl %esi, %edi -; X64-NEXT: sbbl %eax, %eax -; X64-NEXT: andl %eax, %edx -; X64-NEXT: notl %eax -; X64-NEXT: andl %ecx, %eax -; X64-NEXT: orl %edx, %eax +; X64-NEXT: setb %cl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edx, %eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_icmp_ult: ; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: xorl %eax, %eax ; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: sbbl %eax, %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: andl %eax, %ecx -; X32-NEXT: notl %eax -; X32-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NEXT: orl %ecx, %eax +; X32-NEXT: setb %cl +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_icmp_ult: ; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: xorl %eax, %eax -; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: sbbl %eax, %eax -; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: andl %eax, %ecx -; X32-NOCMOV-NEXT: notl %eax -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: orl %ecx, %eax +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: setb %al +; X32-NOCMOV-NEXT: testb %al, %al +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 ; X32-NOCMOV-NEXT: retl %cond = icmp ult i32 %x, %y %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) @@ -581,45 +729,64 @@ define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) { ; X64-LABEL: test_ctselect_fcmp_oeq: ; X64: # %bb.0: -; X64-NEXT: movd %xmm3, %eax -; X64-NEXT: cmpeqss %xmm1, %xmm0 -; X64-NEXT: movd %xmm0, %ecx -; X64-NEXT: pand %xmm2, %xmm0 -; X64-NEXT: movd %xmm0, %edx -; X64-NEXT: notl %ecx -; X64-NEXT: andl %eax, %ecx -; X64-NEXT: orl %edx, %ecx +; X64-NEXT: movd %xmm2, %eax +; X64-NEXT: movd %xmm3, %ecx +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: setnp %dl +; X64-NEXT: sete %sil +; X64-NEXT: testb %dl, %sil +; X64-NEXT: cmovnel %eax, %ecx ; X64-NEXT: movd %ecx, %xmm0 ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_fcmp_oeq: ; X32: # %bb.0: -; X32-NEXT: pushl %eax +; X32-NEXT: pushl %edi ; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: pushl %eax +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 ; X32-NEXT: flds {{[0-9]+}}(%esp) ; X32-NEXT: flds {{[0-9]+}}(%esp) ; X32-NEXT: fucompi %st(1), %st ; X32-NEXT: fstp %st(0) ; X32-NEXT: setnp %al ; X32-NEXT: sete %cl -; X32-NEXT: andb %al, %cl -; X32-NEXT: movzbl %cl, 
%eax -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: negl %ecx -; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: decl %eax -; X32-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NEXT: orl %ecx, %eax -; X32-NEXT: movl %eax, (%esp) +; X32-NEXT: testb %al, %cl +; X32-NEXT: sete %al +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, (%esp) ; X32-NEXT: flds (%esp) -; X32-NEXT: popl %eax +; X32-NEXT: addl $4, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi ; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_fcmp_oeq: ; X32-NOCMOV: # %bb.0: -; X32-NOCMOV-NEXT: pushl %eax +; X32-NOCMOV-NEXT: pushl %edi ; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: pushl %eax +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %edi, -8 ; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) ; X32-NOCMOV-NEXT: fucompp @@ -628,17 +795,25 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) { ; X32-NOCMOV-NEXT: sahf ; X32-NOCMOV-NEXT: setnp %al ; X32-NOCMOV-NEXT: sete %cl -; X32-NOCMOV-NEXT: andb %al, %cl -; X32-NOCMOV-NEXT: movzbl %cl, %eax -; X32-NOCMOV-NEXT: movl %eax, %ecx -; X32-NOCMOV-NEXT: negl %ecx -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: decl %eax -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: orl %ecx, %eax -; X32-NOCMOV-NEXT: movl %eax, (%esp) +; X32-NOCMOV-NEXT: testb %al, %cl +; X32-NOCMOV-NEXT: sete %al +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movb %al, %ah +; X32-NOCMOV-NEXT: movzbl %ah, %edi +; X32-NOCMOV-NEXT: negl %edi +; X32-NOCMOV-NEXT: movl %edx, %esi +; X32-NOCMOV-NEXT: andl %edi, %esi +; X32-NOCMOV-NEXT: notl %edi +; X32-NOCMOV-NEXT: andl %ecx, %edi +; X32-NOCMOV-NEXT: orl %edi, %esi +; X32-NOCMOV-NEXT: movl %esi, (%esp) ; X32-NOCMOV-NEXT: flds (%esp) -; X32-NOCMOV-NEXT: popl %eax +; X32-NOCMOV-NEXT: addl $4, %esp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %edi ; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 ; X32-NOCMOV-NEXT: retl %cond = fcmp oeq float %x, %y @@ -650,51 +825,45 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) { define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { ; X64-LABEL: test_ctselect_load: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: andl $1, %edi -; X64-NEXT: leal -1(%rdi), %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: negl %ecx -; X64-NEXT: andl (%rsi), %ecx -; X64-NEXT: andl (%rdx), %eax -; X64-NEXT: orl %ecx, %eax +; X64-NEXT: movl (%rdx), %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel (%rsi), %eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_load: ; X32: # %bb.0: -; X32-NEXT: pushl %esi -; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: .cfi_offset %esi, -8 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-NEXT: andl $1, %eax -; 
X32-NEXT: movl %eax, %esi -; X32-NEXT: negl %esi -; X32-NEXT: andl (%edx), %esi -; X32-NEXT: decl %eax -; X32-NEXT: andl (%ecx), %eax -; X32-NEXT: orl %esi, %eax -; X32-NEXT: popl %esi -; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel (%ecx), %eax ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_load: ; X32-NOCMOV: # %bb.0: -; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: pushl %ebx ; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 -; X32-NOCMOV-NEXT: .cfi_offset %esi, -8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: andl $1, %eax -; X32-NOCMOV-NEXT: movl %eax, %esi +; X32-NOCMOV-NEXT: movl (%ecx), %ecx +; X32-NOCMOV-NEXT: movl (%eax), %edx +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi ; X32-NOCMOV-NEXT: negl %esi -; X32-NOCMOV-NEXT: andl (%edx), %esi -; X32-NOCMOV-NEXT: decl %eax -; X32-NOCMOV-NEXT: andl (%ecx), %eax +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi ; X32-NOCMOV-NEXT: orl %esi, %eax ; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx ; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 ; X32-NOCMOV-NEXT: retl %a = load i32, ptr %p1 @@ -707,62 +876,63 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { ; X64-LABEL: test_ctselect_nested: ; X64: # %bb.0: -; X64-NEXT: # kill: def $esi killed $esi def $rsi -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: andl $1, %esi -; X64-NEXT: leal -1(%rsi), %r9d -; X64-NEXT: movl %esi, %eax -; X64-NEXT: negl %eax -; X64-NEXT: andl %edx, %eax -; X64-NEXT: andl %ecx, %r9d -; X64-NEXT: orl %eax, %r9d -; X64-NEXT: andl $1, %edi -; X64-NEXT: leal -1(%rdi), %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: negl %ecx -; X64-NEXT: andl %r9d, %ecx -; X64-NEXT: andl %r8d, %eax -; X64-NEXT: orl %ecx, %eax +; X64-NEXT: movl %r8d, %eax +; X64-NEXT: testb $1, %sil +; X64-NEXT: cmovnel %edx, %ecx +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %ecx, %eax ; X64-NEXT: retq ; ; X32-LABEL: test_ctselect_nested: ; X32: # %bb.0: -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: andl $1, %ecx -; X32-NEXT: movl %ecx, %edx -; X32-NEXT: negl %edx -; X32-NEXT: andl {{[0-9]+}}(%esp), %edx -; X32-NEXT: decl %ecx -; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: orl %edx, %ecx -; X32-NEXT: andl $1, %eax -; X32-NEXT: movl %eax, %edx -; X32-NEXT: negl %edx -; X32-NEXT: andl %ecx, %edx -; X32-NEXT: decl %eax -; X32-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NEXT: orl %edx, %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel %ecx, %eax ; X32-NEXT: retl ; ; X32-NOCMOV-LABEL: test_ctselect_nested: ; X32-NOCMOV: # %bb.0: -; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: 
movzbl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: andl $1, %ecx -; X32-NOCMOV-NEXT: movl %ecx, %edx -; X32-NOCMOV-NEXT: negl %edx -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %edx -; X32-NOCMOV-NEXT: decl %ecx -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NOCMOV-NEXT: orl %edx, %ecx -; X32-NOCMOV-NEXT: andl $1, %eax -; X32-NOCMOV-NEXT: movl %eax, %edx -; X32-NOCMOV-NEXT: negl %edx -; X32-NOCMOV-NEXT: andl %ecx, %edx -; X32-NOCMOV-NEXT: decl %eax -; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NOCMOV-NEXT: orl %edx, %eax +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -16 +; X32-NOCMOV-NEXT: .cfi_offset %edi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %edi +; X32-NOCMOV-NEXT: negl %edi +; X32-NOCMOV-NEXT: movl %edx, %esi +; X32-NOCMOV-NEXT: andl %edi, %esi +; X32-NOCMOV-NEXT: notl %edi +; X32-NOCMOV-NEXT: andl %eax, %edi +; X32-NOCMOV-NEXT: orl %edi, %esi +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %dl +; X32-NOCMOV-NEXT: movb %dl, %dh +; X32-NOCMOV-NEXT: movzbl %dh, %edi +; X32-NOCMOV-NEXT: negl %edi +; X32-NOCMOV-NEXT: movl %ecx, %eax +; X32-NOCMOV-NEXT: andl %edi, %eax +; X32-NOCMOV-NEXT: notl %edi +; X32-NOCMOV-NEXT: andl %esi, %edi +; X32-NOCMOV-NEXT: orl %edi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: popl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 ; X32-NOCMOV-NEXT: retl %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b) %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c) From 4f620622c0aafa01754dfebf076391c103810c2d Mon Sep 17 00:00:00 2001 From: wizardengineer Date: Wed, 5 Nov 2025 23:56:12 -0500 Subject: [PATCH 2/2] [LLVM][X86] Add f80 support for ct.select Add special handling for x86_fp80 types in CTSELECT lowering by splitting them into three 32-bit chunks, performing constant-time selection on each chunk, and reassembling the result. This fixes crashes when compiling tests with f80 types. Also updated ctselect.ll to match current generic fallback implementation. 
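As a rough illustration of the chunked lowering described above, the plain-C sketch below shows the intended constant-time behavior. It is not code from this patch: the helper name, the use of long double as the 80-bit x87 type, and the 10-byte value copy are illustrative assumptions. Each 32-bit word is selected with the same mask = -(cond); (a & mask) | (b & ~mask) pattern that the non-CMOV fallback emits (the negl/andl/notl/andl/orl sequences visible in the updated CHECK lines above).

  #include <stdint.h>
  #include <string.h>

  /* Illustrative sketch only: constant-time select of an x86_fp80-sized
     value by splitting it into three 32-bit words, selecting each word
     branchlessly, and reassembling the result. Assumes long double is the
     80-bit x87 type, whose value occupies the low 10 bytes of its slot. */
  static void ctselect_f80_sketch(long double *dst, int cond,
                                  const long double *a,
                                  const long double *b) {
    uint32_t wa[3] = {0}, wb[3] = {0}, wr[3];
    memcpy(wa, a, 10);                       /* copy the 10 value bytes    */
    memcpy(wb, b, 10);
    uint32_t mask = -(uint32_t)(cond & 1);   /* all-ones if cond, else 0   */
    for (int i = 0; i < 3; ++i)              /* no data-dependent branch   */
      wr[i] = (wa[i] & mask) | (wb[i] & ~mask);
    memcpy(dst, wr, 10);                     /* reassemble the result      */
  }

The real lowering performs the equivalent splitting and reassembly on SDNodes rather than through memory copies in user code; the sketch is only meant to make the per-chunk mask-and-merge step concrete.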
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 5300 +++++++++++---------- llvm/lib/Target/X86/X86ISelLowering.h | 3781 ++++++++------- llvm/lib/Target/X86/X86InstrInfo.cpp | 919 ++-- llvm/lib/Target/X86/X86InstrInfo.h | 21 +- llvm/lib/Target/X86/X86TargetMachine.cpp | 15 +- llvm/test/CodeGen/X86/ctselect-i386-fp.ll | 272 +- 6 files changed, 5209 insertions(+), 5099 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 833afa717c32c..7c5de8a834d79 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -29,9 +29,9 @@ #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/VectorUtils.h" -#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/IntrinsicLowering.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -193,10 +193,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // We don't accept any truncstore of integer registers. setTruncStoreAction(MVT::i64, MVT::i32, Expand); setTruncStoreAction(MVT::i64, MVT::i16, Expand); - setTruncStoreAction(MVT::i64, MVT::i8 , Expand); + setTruncStoreAction(MVT::i64, MVT::i8, Expand); setTruncStoreAction(MVT::i32, MVT::i16, Expand); - setTruncStoreAction(MVT::i32, MVT::i8 , Expand); - setTruncStoreAction(MVT::i16, MVT::i8, Expand); + setTruncStoreAction(MVT::i32, MVT::i8, Expand); + setTruncStoreAction(MVT::i16, MVT::i8, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); @@ -208,106 +208,106 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Integer absolute. if (Subtarget.canUseCMOV()) { - setOperationAction(ISD::ABS , MVT::i16 , Custom); - setOperationAction(ISD::ABS , MVT::i32 , Custom); + setOperationAction(ISD::ABS, MVT::i16, Custom); + setOperationAction(ISD::ABS, MVT::i32, Custom); if (Subtarget.is64Bit()) - setOperationAction(ISD::ABS , MVT::i64 , Custom); + setOperationAction(ISD::ABS, MVT::i64, Custom); } // Absolute difference. for (auto Op : {ISD::ABDS, ISD::ABDU}) { - setOperationAction(Op , MVT::i8 , Custom); - setOperationAction(Op , MVT::i16 , Custom); - setOperationAction(Op , MVT::i32 , Custom); + setOperationAction(Op, MVT::i8, Custom); + setOperationAction(Op, MVT::i16, Custom); + setOperationAction(Op, MVT::i32, Custom); if (Subtarget.is64Bit()) - setOperationAction(Op , MVT::i64 , Custom); + setOperationAction(Op, MVT::i64, Custom); } // Signed saturation subtraction. - setOperationAction(ISD::SSUBSAT , MVT::i8 , Custom); - setOperationAction(ISD::SSUBSAT , MVT::i16 , Custom); - setOperationAction(ISD::SSUBSAT , MVT::i32 , Custom); + setOperationAction(ISD::SSUBSAT, MVT::i8, Custom); + setOperationAction(ISD::SSUBSAT, MVT::i16, Custom); + setOperationAction(ISD::SSUBSAT, MVT::i32, Custom); if (Subtarget.is64Bit()) - setOperationAction(ISD::SSUBSAT , MVT::i64 , Custom); + setOperationAction(ISD::SSUBSAT, MVT::i64, Custom); // Funnel shifts. for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) { // For slow shld targets we only lower for code size. LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? 
Custom : Legal; - setOperationAction(ShiftOp , MVT::i8 , Custom); - setOperationAction(ShiftOp , MVT::i16 , Custom); - setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction); + setOperationAction(ShiftOp, MVT::i8, Custom); + setOperationAction(ShiftOp, MVT::i16, Custom); + setOperationAction(ShiftOp, MVT::i32, ShiftDoubleAction); if (Subtarget.is64Bit()) - setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction); + setOperationAction(ShiftOp, MVT::i64, ShiftDoubleAction); } if (!Subtarget.useSoftFloat()) { // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this // operation. - setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote); - setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote); // We have an algorithm for SSE2, and we turn this into a 64-bit // FILD or VCVTUSI2SS/SD for other targets. - setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. - setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have // this operation. - setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote); // SSE has no i16 to fp conversion, only i32. We promote in the handler // to allow f80 to use i16 and f64 to use i16 with sse1 only - setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom); // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not - setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom); // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 // are Legal, f80 is custom lowered. - setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom); // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have // this operation. - setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote); // FIXME: This doesn't generate invalid exception when it should. PR44019. - setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote); - setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 // are Legal, f80 is custom lowered. 
- setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom); // Handle FP_TO_UINT by promoting the destination to a larger signed // conversion. - setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote); + setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote); // FIXME: This doesn't generate invalid exception when it should. PR44019. - setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote); - setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote); + setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); // FIXME: This doesn't generate invalid exception when it should. PR44019. setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote); - setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom); - setOperationAction(ISD::LRINT, MVT::f32, Custom); - setOperationAction(ISD::LRINT, MVT::f64, Custom); - setOperationAction(ISD::LLRINT, MVT::f32, Custom); - setOperationAction(ISD::LLRINT, MVT::f64, Custom); + setOperationAction(ISD::LRINT, MVT::f32, Custom); + setOperationAction(ISD::LRINT, MVT::f64, Custom); + setOperationAction(ISD::LLRINT, MVT::f32, Custom); + setOperationAction(ISD::LLRINT, MVT::f64, Custom); if (!Subtarget.is64Bit()) { - setOperationAction(ISD::LRINT, MVT::i64, Custom); + setOperationAction(ISD::LRINT, MVT::i64, Custom); setOperationAction(ISD::LLRINT, MVT::i64, Custom); } } @@ -315,7 +315,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget.hasSSE2()) { // Custom lowering for saturating float to int conversions. // We handle promotion to larger result types manually. - for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) { + for (MVT VT : {MVT::i8, MVT::i16, MVT::i32}) { setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom); setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom); } @@ -348,17 +348,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // TODO: when we have SSE, these could be more efficient, by using movd/movq. if (!Subtarget.hasSSE2()) { - setOperationAction(ISD::BITCAST , MVT::f32 , Expand); - setOperationAction(ISD::BITCAST , MVT::i32 , Expand); + setOperationAction(ISD::BITCAST, MVT::f32, Expand); + setOperationAction(ISD::BITCAST, MVT::i32, Expand); setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom); if (Subtarget.is64Bit()) { - setOperationAction(ISD::BITCAST , MVT::f64 , Expand); + setOperationAction(ISD::BITCAST, MVT::f64, Expand); // Without SSE, i64->f64 goes through memory. - setOperationAction(ISD::BITCAST , MVT::i64 , Expand); + setOperationAction(ISD::BITCAST, MVT::i64, Expand); } } else if (!Subtarget.is64Bit()) - setOperationAction(ISD::BITCAST , MVT::i64 , Custom); + setOperationAction(ISD::BITCAST, MVT::i64, Custom); // Scalar integer divide and remainder are lowered to use operations that // produce two results, to match the available instructions. This exposes @@ -370,7 +370,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // (low) operations are left as Legal, as there are single-result // instructions for this in x86. 
Using the two-result multiply instructions // when both high and low results are needed must be arranged by dagcombine. - for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { + for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) { setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); @@ -379,47 +379,47 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UREM, VT, Expand); } - setOperationAction(ISD::BR_JT , MVT::Other, Expand); - setOperationAction(ISD::BRCOND , MVT::Other, Custom); - for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128, - MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { - setOperationAction(ISD::BR_CC, VT, Expand); + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::BRCOND, MVT::Other, Custom); + for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128, MVT::i8, MVT::i16, + MVT::i32, MVT::i64}) { + setOperationAction(ISD::BR_CC, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); } if (Subtarget.is64Bit()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - setOperationAction(ISD::FREM , MVT::f32 , Expand); - setOperationAction(ISD::FREM , MVT::f64 , Expand); - setOperationAction(ISD::FREM , MVT::f80 , Expand); - setOperationAction(ISD::FREM , MVT::f128 , Expand); + setOperationAction(ISD::FREM, MVT::f32, Expand); + setOperationAction(ISD::FREM, MVT::f64, Expand); + setOperationAction(ISD::FREM, MVT::f80, Expand); + setOperationAction(ISD::FREM, MVT::f128, Expand); if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) { - setOperationAction(ISD::GET_ROUNDING , MVT::i32 , Custom); - setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom); - setOperationAction(ISD::GET_FPENV_MEM , MVT::Other, Custom); - setOperationAction(ISD::SET_FPENV_MEM , MVT::Other, Custom); - setOperationAction(ISD::RESET_FPENV , MVT::Other, Custom); + setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom); + setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom); + setOperationAction(ISD::GET_FPENV_MEM, MVT::Other, Custom); + setOperationAction(ISD::SET_FPENV_MEM, MVT::Other, Custom); + setOperationAction(ISD::RESET_FPENV, MVT::Other, Custom); } // Promote the i8 variants and force them on up to i32 which has a shorter // encoding. - setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32); - setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32); + setOperationPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32); + setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32); // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to // promote that too. 
- setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32); - setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , MVT::i32); + setOperationPromotedToType(ISD::CTTZ, MVT::i16, MVT::i32); + setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16, MVT::i32); if (!Subtarget.hasBMI()) { - setOperationAction(ISD::CTTZ , MVT::i32 , Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal); + setOperationAction(ISD::CTTZ, MVT::i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal); if (Subtarget.is64Bit()) { - setOperationAction(ISD::CTTZ , MVT::i64 , Custom); + setOperationAction(ISD::CTTZ, MVT::i64, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal); } } @@ -427,13 +427,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget.hasLZCNT()) { // When promoting the i8 variants, force them to i32 for a shorter // encoding. - setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32); - setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); + setOperationPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32); + setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32); } else { for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; - setOperationAction(ISD::CTLZ , VT, Custom); + setOperationAction(ISD::CTLZ, VT, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom); } } @@ -478,31 +478,31 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // on the dest that popcntl hasn't had since Cannon Lake. setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32); } else { - setOperationAction(ISD::CTPOP , MVT::i8 , Custom); - setOperationAction(ISD::CTPOP , MVT::i16 , Custom); - setOperationAction(ISD::CTPOP , MVT::i32 , Custom); - setOperationAction(ISD::CTPOP , MVT::i64 , Custom); + setOperationAction(ISD::CTPOP, MVT::i8, Custom); + setOperationAction(ISD::CTPOP, MVT::i16, Custom); + setOperationAction(ISD::CTPOP, MVT::i32, Custom); + setOperationAction(ISD::CTPOP, MVT::i64, Custom); } - setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); + setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); if (!Subtarget.hasMOVBE()) - setOperationAction(ISD::BSWAP , MVT::i16 , Expand); + setOperationAction(ISD::BSWAP, MVT::i16, Expand); // X86 wants to expand cmov itself. 
- for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) { + for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) { setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); } - for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { + for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::CTSELECT, VT, Custom); - setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); } // Custom action for SELECT MMX and expand action for SELECT_CC MMX @@ -510,7 +510,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTSELECT, MVT::x86mmx, Custom); setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand); - setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); + setOperationAction(ISD::EH_RETURN, MVT::Other, Custom); // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since // LLVM/Clang supports zero-cost DWARF and SEH exception handling. setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); @@ -518,19 +518,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); // Darwin ABI issue. - for (auto VT : { MVT::i32, MVT::i64 }) { + for (auto VT : {MVT::i32, MVT::i64}) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; - setOperationAction(ISD::ConstantPool , VT, Custom); - setOperationAction(ISD::JumpTable , VT, Custom); - setOperationAction(ISD::GlobalAddress , VT, Custom); + setOperationAction(ISD::ConstantPool, VT, Custom); + setOperationAction(ISD::JumpTable, VT, Custom); + setOperationAction(ISD::GlobalAddress, VT, Custom); setOperationAction(ISD::GlobalTLSAddress, VT, Custom); - setOperationAction(ISD::ExternalSymbol , VT, Custom); - setOperationAction(ISD::BlockAddress , VT, Custom); + setOperationAction(ISD::ExternalSymbol, VT, Custom); + setOperationAction(ISD::BlockAddress, VT, Custom); } // 64-bit shl, sra, srl (iff 32-bit x86) - for (auto VT : { MVT::i32, MVT::i64 }) { + for (auto VT : {MVT::i32, MVT::i64}) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::SHL_PARTS, VT, Custom); @@ -539,12 +539,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } if (Subtarget.hasSSEPrefetch()) - setOperationAction(ISD::PREFETCH , MVT::Other, Custom); + setOperationAction(ISD::PREFETCH, MVT::Other, Custom); - setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); + setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); // Expand certain atomics - for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { + for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom); @@ -588,14 +588,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal); // VASTART needs to be custom lowered to use the VarArgsFrameIndex - setOperationAction(ISD::VASTART , MVT::Other, Custom); - setOperationAction(ISD::VAEND , MVT::Other, Expand); + setOperationAction(ISD::VASTART, MVT::Other, Custom); + setOperationAction(ISD::VAEND, MVT::Other, Expand); bool 
Is64Bit = Subtarget.is64Bit(); - setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand); + setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand); setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand); - setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); - setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom); @@ -605,7 +605,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); - auto setF16Action = [&] (MVT VT, LegalizeAction Action) { + auto setF16Action = [&](MVT VT, LegalizeAction Action) { setOperationAction(ISD::FABS, VT, Action); setOperationAction(ISD::FNEG, VT, Action); setOperationAction(ISD::FCOPYSIGN, VT, Expand); @@ -661,7 +661,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // non-optsize case. setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); - for (auto VT : { MVT::f32, MVT::f64 }) { + for (auto VT : {MVT::f32, MVT::f64}) { // Use ANDPD to simulate FABS. setOperationAction(ISD::FABS, VT, Custom); @@ -676,8 +676,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FSUB, VT, Custom); // We don't support sin/cos/fmod - setOperationAction(ISD::FSIN , VT, Expand); - setOperationAction(ISD::FCOS , VT, Expand); + setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); } @@ -740,10 +740,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::f64, &X86::RFP64RegClass); // Use ANDPS to simulate FABS. - setOperationAction(ISD::FABS , MVT::f32, Custom); + setOperationAction(ISD::FABS, MVT::f32, Custom); // Use XORP to simulate FNEG. - setOperationAction(ISD::FNEG , MVT::f32, Custom); + setOperationAction(ISD::FNEG, MVT::f32, Custom); if (UseX87) setOperationAction(ISD::UNDEF, MVT::f64, Expand); @@ -754,8 +754,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); // We don't support sin/cos/fmod - setOperationAction(ISD::FSIN , MVT::f32, Expand); - setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSIN, MVT::f32, Expand); + setOperationAction(ISD::FCOS, MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); if (UseX87) { @@ -770,13 +770,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::f64, &X86::RFP64RegClass); addRegisterClass(MVT::f32, &X86::RFP32RegClass); - for (auto VT : { MVT::f32, MVT::f64 }) { - setOperationAction(ISD::UNDEF, VT, Expand); + for (auto VT : {MVT::f32, MVT::f64}) { + setOperationAction(ISD::UNDEF, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); // Always expand sin/cos functions even though x87 has an instruction. - setOperationAction(ISD::FSIN , VT, Expand); - setOperationAction(ISD::FCOS , VT, Expand); + setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); } } @@ -788,7 +788,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addLegalFPImmediate(APFloat(+1.0f)); // FLD1 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS - } else // SSE immediates. 
+ } else // SSE immediates. addLegalFPImmediate(APFloat(+0.0f)); // xorps } // Expand FP64 immediates into loads from the stack, save special cases. @@ -798,7 +798,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addLegalFPImmediate(APFloat(+1.0)); // FLD1 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS - } else // SSE immediates. + } else // SSE immediates. addLegalFPImmediate(APFloat(+0.0)); // xorpd } // Support fp16 0 immediate. @@ -806,18 +806,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf())); // Handle constrained floating-point operations of scalar. - setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal); - setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal); - setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal); - setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal); - setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal); - setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal); - setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal); - setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal); - setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal); - setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal); - setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal); - setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal); // We don't support FMA. setOperationAction(ISD::FMA, MVT::f64, Expand); @@ -826,21 +826,21 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // f80 always uses X87. if (UseX87) { addRegisterClass(MVT::f80, &X86::RFP80RegClass); - setOperationAction(ISD::UNDEF, MVT::f80, Expand); + setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); { APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended()); - addLegalFPImmediate(TmpFlt); // FLD0 + addLegalFPImmediate(TmpFlt); // FLD0 TmpFlt.changeSign(); - addLegalFPImmediate(TmpFlt); // FLD0/FCHS + addLegalFPImmediate(TmpFlt); // FLD0/FCHS bool ignored; APFloat TmpFlt2(+1.0); - TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven, - &ignored); - addLegalFPImmediate(TmpFlt2); // FLD1 + TmpFlt2.convert(APFloat::x87DoubleExtended(), + APFloat::rmNearestTiesToEven, &ignored); + addLegalFPImmediate(TmpFlt2); // FLD1 TmpFlt2.changeSign(); - addLegalFPImmediate(TmpFlt2); // FLD1/FCHS + addLegalFPImmediate(TmpFlt2); // FLD1/FCHS } // Always expand sin/cos functions even though x87 has an instruction. 
@@ -859,9 +859,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // clang-format on setOperationAction(ISD::FFLOOR, MVT::f80, Expand); - setOperationAction(ISD::FCEIL, MVT::f80, Expand); + setOperationAction(ISD::FCEIL, MVT::f80, Expand); setOperationAction(ISD::FTRUNC, MVT::f80, Expand); - setOperationAction(ISD::FRINT, MVT::f80, Expand); + setOperationAction(ISD::FRINT, MVT::f80, Expand); setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); setOperationAction(ISD::FROUNDEVEN, MVT::f80, Expand); setOperationAction(ISD::FMA, MVT::f80, Expand); @@ -871,12 +871,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::LLRINT, MVT::f80, Custom); // Handle constrained floating-point operations of scalar. - setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal); - setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal); - setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal); - setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal); - setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal); - setOperationAction(ISD::FCANONICALIZE , MVT::f80, Custom); + setOperationAction(ISD::STRICT_FADD, MVT::f80, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::f80, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::f80, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::f80, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::f80, Legal); + setOperationAction(ISD::FCANONICALIZE, MVT::f80, Custom); if (isTypeLegal(MVT::f16)) { setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom); @@ -895,16 +895,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps - setOperationAction(ISD::FADD, MVT::f128, LibCall); + setOperationAction(ISD::FADD, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall); - setOperationAction(ISD::FSUB, MVT::f128, LibCall); + setOperationAction(ISD::FSUB, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall); - setOperationAction(ISD::FDIV, MVT::f128, LibCall); + setOperationAction(ISD::FDIV, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall); - setOperationAction(ISD::FMUL, MVT::f128, LibCall); + setOperationAction(ISD::FMUL, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall); - setOperationAction(ISD::FMA, MVT::f128, LibCall); - setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall); + setOperationAction(ISD::FMA, MVT::f128, LibCall); + setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall); setOperationAction(ISD::FABS, MVT::f128, Custom); setOperationAction(ISD::FNEG, MVT::f128, Custom); @@ -920,10 +920,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FTAN, MVT::f128, LibCall); // clang-format on // No STRICT_FSINCOS - setOperationAction(ISD::FSQRT, MVT::f128, LibCall); + setOperationAction(ISD::FSQRT, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall); - setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom); // We need to custom handle any FP_ROUND with an f128 input, but // LegalizeDAG uses the result type to know when to run a custom handler. @@ -953,10 +953,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Always use a library call for pow. 
- setOperationAction(ISD::FPOW , MVT::f32 , Expand); - setOperationAction(ISD::FPOW , MVT::f64 , Expand); - setOperationAction(ISD::FPOW , MVT::f80 , Expand); - setOperationAction(ISD::FPOW , MVT::f128 , Expand); + setOperationAction(ISD::FPOW, MVT::f32, Expand); + setOperationAction(ISD::FPOW, MVT::f64, Expand); + setOperationAction(ISD::FPOW, MVT::f80, Expand); + setOperationAction(ISD::FPOW, MVT::f128, Expand); setOperationAction(ISD::FLOG, MVT::f80, Expand); setOperationAction(ISD::FLOG2, MVT::f80, Expand); @@ -968,9 +968,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMAXNUM, MVT::f80, Expand); // Some FP actions are always expanded for vector types. - for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16, - MVT::v4f32, MVT::v8f32, MVT::v16f32, - MVT::v2f64, MVT::v4f64, MVT::v8f64 }) { + for (auto VT : {MVT::v8f16, MVT::v16f16, MVT::v32f16, MVT::v4f32, MVT::v8f32, + MVT::v16f32, MVT::v2f64, MVT::v4f64, MVT::v8f64}) { // clang-format off setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); @@ -996,11 +995,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); - setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand); - setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand); - setOperationAction(ISD::FMA, VT, Expand); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand); + setOperationAction(ISD::FMA, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); @@ -1024,7 +1023,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); setOperationAction(ISD::SINT_TO_FP, VT, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); setOperationAction(ISD::TRUNCATE, VT, Expand); setOperationAction(ISD::SIGN_EXTEND, VT, Expand); setOperationAction(ISD::ZERO_EXTEND, VT, Expand); @@ -1062,31 +1061,31 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? 
&X86::VR128XRegClass : &X86::VR128RegClass); - setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom); - setOperationAction(ISD::FMINIMUM, MVT::f32, Custom); - setOperationAction(ISD::FMAXIMUMNUM, MVT::f32, Custom); - setOperationAction(ISD::FMINIMUMNUM, MVT::f32, Custom); - - setOperationAction(ISD::FNEG, MVT::v4f32, Custom); - setOperationAction(ISD::FABS, MVT::v4f32, Custom); - setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); - setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); + setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom); + setOperationAction(ISD::FMINIMUM, MVT::f32, Custom); + setOperationAction(ISD::FMAXIMUMNUM, MVT::f32, Custom); + setOperationAction(ISD::FMINIMUMNUM, MVT::f32, Custom); + + setOperationAction(ISD::FNEG, MVT::v4f32, Custom); + setOperationAction(ISD::FABS, MVT::v4f32, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); + setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); - setOperationAction(ISD::SELECT, MVT::v4f32, Custom); + setOperationAction(ISD::SELECT, MVT::v4f32, Custom); setOperationAction(ISD::CTSELECT, MVT::v4f32, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::v4f32, Custom); - setOperationAction(ISD::LOAD, MVT::v2f32, Custom); - setOperationAction(ISD::STORE, MVT::v2f32, Custom); + setOperationAction(ISD::LOAD, MVT::v2f32, Custom); + setOperationAction(ISD::STORE, MVT::v2f32, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::v2f32, Custom); - setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal); - setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal); - setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal); - setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal); - setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { @@ -1106,74 +1105,74 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? 
&X86::VR128XRegClass : &X86::VR128RegClass); - for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) { + for (auto VT : {MVT::f64, MVT::v4f32, MVT::v2f64}) { setOperationAction(ISD::FMAXIMUM, VT, Custom); setOperationAction(ISD::FMINIMUM, VT, Custom); setOperationAction(ISD::FMAXIMUMNUM, VT, Custom); setOperationAction(ISD::FMINIMUMNUM, VT, Custom); } - for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8, - MVT::v2i16, MVT::v4i16, MVT::v2i32 }) { + for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16, + MVT::v2i32}) { setOperationAction(ISD::SDIV, VT, Custom); setOperationAction(ISD::SREM, VT, Custom); setOperationAction(ISD::UDIV, VT, Custom); setOperationAction(ISD::UREM, VT, Custom); } - setOperationAction(ISD::MUL, MVT::v2i8, Custom); - setOperationAction(ISD::MUL, MVT::v4i8, Custom); - setOperationAction(ISD::MUL, MVT::v8i8, Custom); - - setOperationAction(ISD::MUL, MVT::v16i8, Custom); - setOperationAction(ISD::MUL, MVT::v4i32, Custom); - setOperationAction(ISD::MUL, MVT::v2i64, Custom); - setOperationAction(ISD::MULHU, MVT::v4i32, Custom); - setOperationAction(ISD::MULHS, MVT::v4i32, Custom); - setOperationAction(ISD::MULHU, MVT::v16i8, Custom); - setOperationAction(ISD::MULHS, MVT::v16i8, Custom); - setOperationAction(ISD::MULHU, MVT::v8i16, Legal); - setOperationAction(ISD::MULHS, MVT::v8i16, Legal); - setOperationAction(ISD::MUL, MVT::v8i16, Legal); - setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal); - setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal); - - setOperationAction(ISD::SMULO, MVT::v16i8, Custom); - setOperationAction(ISD::UMULO, MVT::v16i8, Custom); - setOperationAction(ISD::UMULO, MVT::v2i32, Custom); - - setOperationAction(ISD::FNEG, MVT::v2f64, Custom); + setOperationAction(ISD::MUL, MVT::v2i8, Custom); + setOperationAction(ISD::MUL, MVT::v4i8, Custom); + setOperationAction(ISD::MUL, MVT::v8i8, Custom); + + setOperationAction(ISD::MUL, MVT::v16i8, Custom); + setOperationAction(ISD::MUL, MVT::v4i32, Custom); + setOperationAction(ISD::MUL, MVT::v2i64, Custom); + setOperationAction(ISD::MULHU, MVT::v4i32, Custom); + setOperationAction(ISD::MULHS, MVT::v4i32, Custom); + setOperationAction(ISD::MULHU, MVT::v16i8, Custom); + setOperationAction(ISD::MULHS, MVT::v16i8, Custom); + setOperationAction(ISD::MULHU, MVT::v8i16, Legal); + setOperationAction(ISD::MULHS, MVT::v8i16, Legal); + setOperationAction(ISD::MUL, MVT::v8i16, Legal); + setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal); + setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal); + + setOperationAction(ISD::SMULO, MVT::v16i8, Custom); + setOperationAction(ISD::UMULO, MVT::v16i8, Custom); + setOperationAction(ISD::UMULO, MVT::v2i32, Custom); + + setOperationAction(ISD::FNEG, MVT::v2f64, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::v2f64, Custom); - setOperationAction(ISD::FABS, MVT::v2f64, Custom); - setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom); + setOperationAction(ISD::FABS, MVT::v2f64, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom); setOperationAction(ISD::LRINT, MVT::v4f32, Custom); setOperationAction(ISD::LRINT, MVT::v2i32, Custom); - for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { + for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) { setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom); setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom); setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom); setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? 
Legal : Custom); } - setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal); - setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal); - setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal); - setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal); - setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal); - setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal); - setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal); - setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal); - setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom); - setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom); + setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal); + setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal); + setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal); + setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal); + setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal); + setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal); + setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal); + setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal); + setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom); + setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); - for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { + for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::ABS, VT, Custom); @@ -1186,30 +1185,30 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setCondCodeAction(ISD::SETLE, VT, Custom); } - setOperationAction(ISD::SETCC, MVT::v2f64, Custom); - setOperationAction(ISD::SETCC, MVT::v4f32, Custom); - setOperationAction(ISD::STRICT_FSETCC, MVT::v2f64, Custom); - setOperationAction(ISD::STRICT_FSETCC, MVT::v4f32, Custom); + setOperationAction(ISD::SETCC, MVT::v2f64, Custom); + setOperationAction(ISD::SETCC, MVT::v4f32, Custom); + setOperationAction(ISD::STRICT_FSETCC, MVT::v2f64, Custom); + setOperationAction(ISD::STRICT_FSETCC, MVT::v4f32, Custom); setOperationAction(ISD::STRICT_FSETCCS, MVT::v2f64, Custom); setOperationAction(ISD::STRICT_FSETCCS, MVT::v4f32, Custom); - for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { - setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); - setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Custom); + for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) { + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } - for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) { - setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Custom); + for (auto VT : {MVT::v8f16, MVT::v2f64, MVT::v2i64}) { + setOperationAction(ISD::BUILD_VECTOR, 
VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Custom); if (VT == MVT::v2i64 && !Subtarget.is64Bit()) continue; - setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } setF16Action(MVT::v8f16, Expand); @@ -1222,12 +1221,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Custom); // Custom lower v2i64 and v2f64 selects. - setOperationAction(ISD::SELECT, MVT::v2f64, Custom); - setOperationAction(ISD::SELECT, MVT::v2i64, Custom); - setOperationAction(ISD::SELECT, MVT::v4i32, Custom); - setOperationAction(ISD::SELECT, MVT::v8i16, Custom); - setOperationAction(ISD::SELECT, MVT::v8f16, Custom); - setOperationAction(ISD::SELECT, MVT::v16i8, Custom); + setOperationAction(ISD::SELECT, MVT::v2f64, Custom); + setOperationAction(ISD::SELECT, MVT::v2i64, Custom); + setOperationAction(ISD::SELECT, MVT::v4i32, Custom); + setOperationAction(ISD::SELECT, MVT::v8i16, Custom); + setOperationAction(ISD::SELECT, MVT::v8f16, Custom); + setOperationAction(ISD::SELECT, MVT::v16i8, Custom); setOperationAction(ISD::CTSELECT, MVT::v2f64, Custom); setOperationAction(ISD::CTSELECT, MVT::v2i64, Custom); @@ -1236,60 +1235,60 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTSELECT, MVT::v8f16, Custom); setOperationAction(ISD::CTSELECT, MVT::v16i8, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom); - setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom); - setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom); // Custom legalize these to avoid over promotion or custom promotion. 
for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) { - setOperationAction(ISD::FP_TO_SINT, VT, Custom); - setOperationAction(ISD::FP_TO_UINT, VT, Custom); + setOperationAction(ISD::FP_TO_SINT, VT, Custom); + setOperationAction(ISD::FP_TO_UINT, VT, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom); } - setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom); - setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); - setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); - setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); - setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom); // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion. - setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom); - setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); - setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom); - setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom); - setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); - setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom); + setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom); // We want to legalize this to an f64 load rather than an i64 load on // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for // store. - setOperationAction(ISD::LOAD, MVT::v2i32, Custom); - setOperationAction(ISD::LOAD, MVT::v4i16, Custom); - setOperationAction(ISD::LOAD, MVT::v8i8, Custom); - setOperationAction(ISD::STORE, MVT::v2i32, Custom); - setOperationAction(ISD::STORE, MVT::v4i16, Custom); - setOperationAction(ISD::STORE, MVT::v8i8, Custom); + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i16, Custom); + setOperationAction(ISD::LOAD, MVT::v8i8, Custom); + setOperationAction(ISD::STORE, MVT::v2i32, Custom); + setOperationAction(ISD::STORE, MVT::v4i16, Custom); + setOperationAction(ISD::STORE, MVT::v8i8, Custom); // Add 32-bit vector stores to help vectorization opportunities. 
- setOperationAction(ISD::STORE, MVT::v2i16, Custom); - setOperationAction(ISD::STORE, MVT::v4i8, Custom); + setOperationAction(ISD::STORE, MVT::v2i16, Custom); + setOperationAction(ISD::STORE, MVT::v4i8, Custom); - setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); - setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); - setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); + setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); + setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); + setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); if (!Subtarget.hasAVX512()) setOperationAction(ISD::BITCAST, MVT::v16i1, Custom); @@ -1299,41 +1298,42 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v2i64, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v4i64, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i64, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i64, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom); // In the customized shift lowering, the legal v4i32/v2i64 cases // in AVX2 will be recognized. 
- for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { - setOperationAction(ISD::SRL, VT, Custom); - setOperationAction(ISD::SHL, VT, Custom); - setOperationAction(ISD::SRA, VT, Custom); - if (VT == MVT::v2i64) continue; - setOperationAction(ISD::ROTL, VT, Custom); - setOperationAction(ISD::ROTR, VT, Custom); - setOperationAction(ISD::FSHL, VT, Custom); - setOperationAction(ISD::FSHR, VT, Custom); + for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) { + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + if (VT == MVT::v2i64) + continue; + setOperationAction(ISD::ROTL, VT, Custom); + setOperationAction(ISD::ROTR, VT, Custom); + setOperationAction(ISD::FSHL, VT, Custom); + setOperationAction(ISD::FSHR, VT, Custom); } - setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal); - setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal); - setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal); - setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal); - setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal); } if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) { @@ -1348,73 +1348,73 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) { - setOperationAction(ISD::ABS, MVT::v16i8, Legal); - setOperationAction(ISD::ABS, MVT::v8i16, Legal); - setOperationAction(ISD::ABS, MVT::v4i32, Legal); + setOperationAction(ISD::ABS, MVT::v16i8, Legal); + setOperationAction(ISD::ABS, MVT::v8i16, Legal); + setOperationAction(ISD::ABS, MVT::v4i32, Legal); for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) { - setOperationAction(ISD::BITREVERSE, VT, Custom); - setOperationAction(ISD::CTLZ, VT, Custom); + setOperationAction(ISD::BITREVERSE, VT, Custom); + setOperationAction(ISD::CTLZ, VT, Custom); } // These might be better off as horizontal vector ops. 
- setOperationAction(ISD::ADD, MVT::i16, Custom); - setOperationAction(ISD::ADD, MVT::i32, Custom); - setOperationAction(ISD::SUB, MVT::i16, Custom); - setOperationAction(ISD::SUB, MVT::i32, Custom); + setOperationAction(ISD::ADD, MVT::i16, Custom); + setOperationAction(ISD::ADD, MVT::i32, Custom); + setOperationAction(ISD::SUB, MVT::i16, Custom); + setOperationAction(ISD::SUB, MVT::i32, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) { for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) { - setOperationAction(ISD::FFLOOR, RoundedTy, Legal); - setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal); - setOperationAction(ISD::FCEIL, RoundedTy, Legal); - setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal); - setOperationAction(ISD::FTRUNC, RoundedTy, Legal); - setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal); - setOperationAction(ISD::FRINT, RoundedTy, Legal); - setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal); - setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); - setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal); - setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal); - setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal); - - setOperationAction(ISD::FROUND, RoundedTy, Custom); - } - - setOperationAction(ISD::SMAX, MVT::v16i8, Legal); - setOperationAction(ISD::SMAX, MVT::v4i32, Legal); - setOperationAction(ISD::UMAX, MVT::v8i16, Legal); - setOperationAction(ISD::UMAX, MVT::v4i32, Legal); - setOperationAction(ISD::SMIN, MVT::v16i8, Legal); - setOperationAction(ISD::SMIN, MVT::v4i32, Legal); - setOperationAction(ISD::UMIN, MVT::v8i16, Legal); - setOperationAction(ISD::UMIN, MVT::v4i32, Legal); - - setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom); - setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom); - setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom); + setOperationAction(ISD::FFLOOR, RoundedTy, Legal); + setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal); + setOperationAction(ISD::FCEIL, RoundedTy, Legal); + setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal); + setOperationAction(ISD::FTRUNC, RoundedTy, Legal); + setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal); + setOperationAction(ISD::FRINT, RoundedTy, Legal); + setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal); + setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); + setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal); + setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal); + setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal); + + setOperationAction(ISD::FROUND, RoundedTy, Custom); + } + + setOperationAction(ISD::SMAX, MVT::v16i8, Legal); + setOperationAction(ISD::SMAX, MVT::v4i32, Legal); + setOperationAction(ISD::UMAX, MVT::v8i16, Legal); + setOperationAction(ISD::UMAX, MVT::v4i32, Legal); + setOperationAction(ISD::SMIN, MVT::v16i8, Legal); + setOperationAction(ISD::SMIN, MVT::v4i32, Legal); + setOperationAction(ISD::UMIN, MVT::v8i16, Legal); + setOperationAction(ISD::UMIN, MVT::v4i32, Legal); + + setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom); + setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom); + setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom); // FIXME: Do we need to handle scalar-to-vector here? 
- setOperationAction(ISD::MUL, MVT::v4i32, Legal); - setOperationAction(ISD::SMULO, MVT::v2i32, Custom); + setOperationAction(ISD::MUL, MVT::v4i32, Legal); + setOperationAction(ISD::SMULO, MVT::v2i32, Custom); // We directly match byte blends in the backend as they match the VSELECT // condition form. - setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); + setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); // SSE41 brings specific instructions for doing vector sign extend even in // cases where we don't have SRA. - for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { + for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal); } // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X - for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { - setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal); - setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal); - setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal); + for (auto LoadExtOp : {ISD::SEXTLOAD, ISD::ZEXTLOAD}) { + setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal); + setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal); + setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal); @@ -1423,73 +1423,73 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) { // We need to scalarize v4i64->v432 uint_to_fp using cvtsi2ss, but we can // do the pre and post work in the vector domain. - setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom); // We need to mark SINT_TO_FP as Custom even though we want to expand it // so that DAG combine doesn't try to turn it into uint_to_fp. - setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom); } } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) { - setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom); + setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) { - for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, - MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { + for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v32i8, + MVT::v16i16, MVT::v8i32, MVT::v4i64}) { setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); } // XOP can efficiently perform BITREVERSE with VPPERM. - for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) + for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) setOperationAction(ISD::BITREVERSE, VT, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) { bool HasInt256 = Subtarget.hasInt256(); - addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass - : &X86::VR256RegClass); + addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass + : &X86::VR256RegClass); addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? 
&X86::VR256XRegClass : &X86::VR256RegClass); - addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass - : &X86::VR256RegClass); - addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass - : &X86::VR256RegClass); - addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass - : &X86::VR256RegClass); - addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass - : &X86::VR256RegClass); - - for (auto VT : { MVT::v8f32, MVT::v4f64 }) { - setOperationAction(ISD::FFLOOR, VT, Legal); - setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); - setOperationAction(ISD::FCEIL, VT, Legal); - setOperationAction(ISD::STRICT_FCEIL, VT, Legal); - setOperationAction(ISD::FTRUNC, VT, Legal); - setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); - setOperationAction(ISD::FRINT, VT, Legal); - setOperationAction(ISD::STRICT_FRINT, VT, Legal); - setOperationAction(ISD::FNEARBYINT, VT, Legal); + addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass + : &X86::VR256RegClass); + addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass + : &X86::VR256RegClass); + addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass + : &X86::VR256RegClass); + addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass + : &X86::VR256RegClass); + + for (auto VT : {MVT::v8f32, MVT::v4f64}) { + setOperationAction(ISD::FFLOOR, VT, Legal); + setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); + setOperationAction(ISD::FCEIL, VT, Legal); + setOperationAction(ISD::STRICT_FCEIL, VT, Legal); + setOperationAction(ISD::FTRUNC, VT, Legal); + setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); + setOperationAction(ISD::FRINT, VT, Legal); + setOperationAction(ISD::STRICT_FRINT, VT, Legal); + setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); - setOperationAction(ISD::FROUNDEVEN, VT, Legal); + setOperationAction(ISD::FROUNDEVEN, VT, Legal); setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal); - setOperationAction(ISD::FROUND, VT, Custom); + setOperationAction(ISD::FROUND, VT, Custom); - setOperationAction(ISD::FNEG, VT, Custom); - setOperationAction(ISD::FABS, VT, Custom); - setOperationAction(ISD::FCOPYSIGN, VT, Custom); + setOperationAction(ISD::FNEG, VT, Custom); + setOperationAction(ISD::FABS, VT, Custom); + setOperationAction(ISD::FCOPYSIGN, VT, Custom); - setOperationAction(ISD::FMAXIMUM, VT, Custom); - setOperationAction(ISD::FMINIMUM, VT, Custom); - setOperationAction(ISD::FMAXIMUMNUM, VT, Custom); - setOperationAction(ISD::FMINIMUMNUM, VT, Custom); + setOperationAction(ISD::FMAXIMUM, VT, Custom); + setOperationAction(ISD::FMINIMUM, VT, Custom); + setOperationAction(ISD::FMAXIMUMNUM, VT, Custom); + setOperationAction(ISD::FMINIMUMNUM, VT, Custom); setOperationAction(ISD::FCANONICALIZE, VT, Custom); } @@ -1498,64 +1498,65 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted // even though v8i16 is a legal type. 
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32); - setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32); + setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32); + setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32); - setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom); - setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom); - - setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom); - setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom); - setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand); - setOperationAction(ISD::FP_ROUND, MVT::v8f16, Expand); - setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom); - - setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal); - setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal); - setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal); - setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal); - setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal); - setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal); - setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal); - setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal); - setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal); - setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal); - setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom); + + setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v8f16, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom); + + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal); + setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal); if (!Subtarget.hasAVX512()) setOperationAction(ISD::BITCAST, MVT::v32i1, Custom); // In the customized shift lowering, the legal v8i32/v4i64 cases // in AVX2 will be recognized. 
- for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { - setOperationAction(ISD::SRL, VT, Custom); - setOperationAction(ISD::SHL, VT, Custom); - setOperationAction(ISD::SRA, VT, Custom); - setOperationAction(ISD::ABDS, VT, Custom); - setOperationAction(ISD::ABDU, VT, Custom); - if (VT == MVT::v4i64) continue; - setOperationAction(ISD::ROTL, VT, Custom); - setOperationAction(ISD::ROTR, VT, Custom); - setOperationAction(ISD::FSHL, VT, Custom); - setOperationAction(ISD::FSHR, VT, Custom); + for (auto VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64}) { + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::ABDS, VT, Custom); + setOperationAction(ISD::ABDU, VT, Custom); + if (VT == MVT::v4i64) + continue; + setOperationAction(ISD::ROTL, VT, Custom); + setOperationAction(ISD::ROTR, VT, Custom); + setOperationAction(ISD::FSHL, VT, Custom); + setOperationAction(ISD::FSHR, VT, Custom); } // These types need custom splitting if their input is a 128-bit vector. - setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); - - setOperationAction(ISD::SELECT, MVT::v4f64, Custom); - setOperationAction(ISD::SELECT, MVT::v4i64, Custom); - setOperationAction(ISD::SELECT, MVT::v8i32, Custom); - setOperationAction(ISD::SELECT, MVT::v16i16, Custom); - setOperationAction(ISD::SELECT, MVT::v16f16, Custom); - setOperationAction(ISD::SELECT, MVT::v32i8, Custom); - setOperationAction(ISD::SELECT, MVT::v8f32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); + + setOperationAction(ISD::SELECT, MVT::v4f64, Custom); + setOperationAction(ISD::SELECT, MVT::v4i64, Custom); + setOperationAction(ISD::SELECT, MVT::v8i32, Custom); + setOperationAction(ISD::SELECT, MVT::v16i16, Custom); + setOperationAction(ISD::SELECT, MVT::v16f16, Custom); + setOperationAction(ISD::SELECT, MVT::v32i8, Custom); + setOperationAction(ISD::SELECT, MVT::v8f32, Custom); setOperationAction(ISD::CTSELECT, MVT::v4f64, Custom); setOperationAction(ISD::CTSELECT, MVT::v4i64, Custom); @@ -1565,22 +1566,22 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTSELECT, MVT::v32i8, Custom); setOperationAction(ISD::CTSELECT, MVT::v8f32, Custom); - for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { - setOperationAction(ISD::SIGN_EXTEND, VT, Custom); - setOperationAction(ISD::ZERO_EXTEND, VT, Custom); - setOperationAction(ISD::ANY_EXTEND, VT, Custom); + for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) { + setOperationAction(ISD::SIGN_EXTEND, VT, Custom); + setOperationAction(ISD::ZERO_EXTEND, VT, Custom); + setOperationAction(ISD::ANY_EXTEND, VT, Custom); } - setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v32i16, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v32i32, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v32i64, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v32i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v32i32, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v32i64, Custom); 
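For readers following the legalization-table changes: the ISD::CTSELECT entries above sit alongside the corresponding ISD::SELECT rows, so every 256-bit type that supports an ordinary select is also given a Custom-marked constant-time select. As a minimal sketch of what Custom means here, i.e. where such a node is routed during legalization — the LowerCTSELECT helper name is an assumption for illustration, not a claim about this patch's exact naming:

// Sketch only: a Custom operation action sends the node to the target's
// LowerOperation hook; the LowerCTSELECT name below is illustrative.
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Should not custom lower this!");
  // ... other custom-lowered opcodes elided ...
  case ISD::CTSELECT:
    // Unlike a plain SELECT, the result may not be formed by branching on
    // the condition; the lowering has to emit CMOV/blend-style data flow.
    return LowerCTSELECT(Op, DAG);
  }
}
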
- for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { - setOperationAction(ISD::SETCC, VT, Custom); - setOperationAction(ISD::CTPOP, VT, Custom); - setOperationAction(ISD::CTLZ, VT, Custom); - setOperationAction(ISD::BITREVERSE, VT, Custom); + for (auto VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64}) { + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::CTPOP, VT, Custom); + setOperationAction(ISD::CTLZ, VT, Custom); + setOperationAction(ISD::BITREVERSE, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. @@ -1588,64 +1589,64 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setCondCodeAction(ISD::SETLE, VT, Custom); } - setOperationAction(ISD::SETCC, MVT::v4f64, Custom); - setOperationAction(ISD::SETCC, MVT::v8f32, Custom); - setOperationAction(ISD::STRICT_FSETCC, MVT::v4f64, Custom); - setOperationAction(ISD::STRICT_FSETCC, MVT::v8f32, Custom); + setOperationAction(ISD::SETCC, MVT::v4f64, Custom); + setOperationAction(ISD::SETCC, MVT::v8f32, Custom); + setOperationAction(ISD::STRICT_FSETCC, MVT::v4f64, Custom); + setOperationAction(ISD::STRICT_FSETCC, MVT::v8f32, Custom); setOperationAction(ISD::STRICT_FSETCCS, MVT::v4f64, Custom); setOperationAction(ISD::STRICT_FSETCCS, MVT::v8f32, Custom); if (Subtarget.hasAnyFMA()) { - for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32, - MVT::v2f64, MVT::v4f64 }) { + for (auto VT : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32, MVT::v2f64, + MVT::v4f64}) { setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::STRICT_FMA, VT, Legal); } } - for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { + for (auto VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64}) { setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom); } - setOperationAction(ISD::MUL, MVT::v4i64, Custom); - setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom); - setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom); - setOperationAction(ISD::MUL, MVT::v32i8, Custom); - - setOperationAction(ISD::MULHU, MVT::v8i32, Custom); - setOperationAction(ISD::MULHS, MVT::v8i32, Custom); - setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom); - setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom); - setOperationAction(ISD::MULHU, MVT::v32i8, Custom); - setOperationAction(ISD::MULHS, MVT::v32i8, Custom); - setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom); - setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom); - - setOperationAction(ISD::SMULO, MVT::v32i8, Custom); - setOperationAction(ISD::UMULO, MVT::v32i8, Custom); - - setOperationAction(ISD::ABS, MVT::v4i64, Custom); - setOperationAction(ISD::SMAX, MVT::v4i64, Custom); - setOperationAction(ISD::UMAX, MVT::v4i64, Custom); - setOperationAction(ISD::SMIN, MVT::v4i64, Custom); - setOperationAction(ISD::UMIN, MVT::v4i64, Custom); - - setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom); - setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom); - setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom); - setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom); - setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom); - setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? 
Legal : Custom); - setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom); - setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom); - setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom); - setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom); - setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom); - setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom); - - for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { - setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom); + setOperationAction(ISD::MUL, MVT::v4i64, Custom); + setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom); + setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom); + setOperationAction(ISD::MUL, MVT::v32i8, Custom); + + setOperationAction(ISD::MULHU, MVT::v8i32, Custom); + setOperationAction(ISD::MULHS, MVT::v8i32, Custom); + setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom); + setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom); + setOperationAction(ISD::MULHU, MVT::v32i8, Custom); + setOperationAction(ISD::MULHS, MVT::v32i8, Custom); + setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom); + setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom); + + setOperationAction(ISD::SMULO, MVT::v32i8, Custom); + setOperationAction(ISD::UMULO, MVT::v32i8, Custom); + + setOperationAction(ISD::ABS, MVT::v4i64, Custom); + setOperationAction(ISD::SMAX, MVT::v4i64, Custom); + setOperationAction(ISD::UMAX, MVT::v4i64, Custom); + setOperationAction(ISD::SMIN, MVT::v4i64, Custom); + setOperationAction(ISD::UMIN, MVT::v4i64, Custom); + + setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom); + setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom); + setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom); + setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom); + setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom); + setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom); + setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom); + setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom); + setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom); + setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom); + setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom); + setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom); + + for (auto VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32}) { + setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMIN, VT, HasInt256 ? 
Legal : Custom); @@ -1664,41 +1665,41 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom); // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X - for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { + for (auto LoadExtOp : {ISD::SEXTLOAD, ISD::ZEXTLOAD}) { setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal); - setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal); - setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal); - setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal); - setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal); - setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal); + setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal); + setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal); + setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal); + setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal); + setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal); } } - for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, - MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { - setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); + for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, + MVT::v8f32, MVT::v2f64, MVT::v4f64}) { + setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::MSTORE, VT, Legal); } // Extract subvector is special because the value type // (result) is 128-bit but the source is 256-bit wide. - for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, - MVT::v8f16, MVT::v4f32, MVT::v2f64 }) { + for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v8f16, + MVT::v4f32, MVT::v2f64}) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); } // Custom lower several nodes for 256-bit types. 
- for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, - MVT::v16f16, MVT::v8f32, MVT::v4f64 }) { - setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + for (MVT VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, MVT::v16f16, + MVT::v8f32, MVT::v4f64}) { + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); - setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); - setOperationAction(ISD::STORE, VT, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::STORE, VT, Custom); } setF16Action(MVT::v16f16, Expand); setOperationAction(ISD::FNEG, MVT::v16f16, Custom); @@ -1716,21 +1717,21 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MGATHER, MVT::v2f32, Custom); setOperationAction(ISD::MGATHER, MVT::v2i32, Custom); - for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, - MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) - setOperationAction(ISD::MGATHER, VT, Custom); + for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, + MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) + setOperationAction(ISD::MGATHER, VT, Custom); } } if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() && Subtarget.hasF16C()) { - for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) { - setOperationAction(ISD::FP_ROUND, VT, Custom); - setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom); + for (MVT VT : {MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16}) { + setOperationAction(ISD::FP_ROUND, VT, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom); } - for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) { - setOperationAction(ISD::FP_EXTEND, VT, Custom); - setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom); + for (MVT VT : {MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32}) { + setOperationAction(ISD::FP_EXTEND, VT, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom); } for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) { setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32); @@ -1744,29 +1745,29 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // available with AVX512. 512-bit vectors are in a separate block controlled // by useAVX512Regs. 
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { - addRegisterClass(MVT::v1i1, &X86::VK1RegClass); - addRegisterClass(MVT::v2i1, &X86::VK2RegClass); - addRegisterClass(MVT::v4i1, &X86::VK4RegClass); - addRegisterClass(MVT::v8i1, &X86::VK8RegClass); - addRegisterClass(MVT::v16i1, &X86::VK16RegClass); + addRegisterClass(MVT::v1i1, &X86::VK1RegClass); + addRegisterClass(MVT::v2i1, &X86::VK2RegClass); + addRegisterClass(MVT::v4i1, &X86::VK4RegClass); + addRegisterClass(MVT::v8i1, &X86::VK8RegClass); + addRegisterClass(MVT::v16i1, &X86::VK16RegClass); - setOperationAction(ISD::SELECT, MVT::v1i1, Custom); + setOperationAction(ISD::SELECT, MVT::v1i1, Custom); setOperationAction(ISD::CTSELECT, MVT::v1i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); - - setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32); - setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32); - setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32); - setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32); - setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32); - setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32); - setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32); - setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32); - setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom); - setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom); - setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); + + setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32); + setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32); + setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32); + setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::v8f16, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::v16f16, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::v32f16, Custom); @@ -1785,30 +1786,30 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors. 
- for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { + for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) { setOperationAction(ISD::SIGN_EXTEND, VT, Custom); setOperationAction(ISD::ZERO_EXTEND, VT, Custom); - setOperationAction(ISD::ANY_EXTEND, VT, Custom); + setOperationAction(ISD::ANY_EXTEND, VT, Custom); } - for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) - setOperationAction(ISD::VSELECT, VT, Expand); + for (auto VT : {MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1}) + setOperationAction(ISD::VSELECT, VT, Expand); - for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) { - setOperationAction(ISD::SETCC, VT, Custom); - setOperationAction(ISD::SELECT, VT, Custom); + for (auto VT : {MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1}) { + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::CTSELECT, VT, Custom); - setOperationAction(ISD::TRUNCATE, VT, Custom); + setOperationAction(ISD::TRUNCATE, VT, Custom); - setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); } - for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 }) + for (auto VT : {MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1}) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); } if (Subtarget.hasDQI() && Subtarget.hasVLX()) { @@ -1826,30 +1827,30 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v16i32, &X86::VR512RegClass); addRegisterClass(MVT::v16f32, &X86::VR512RegClass); - addRegisterClass(MVT::v8i64, &X86::VR512RegClass); - addRegisterClass(MVT::v8f64, &X86::VR512RegClass); + addRegisterClass(MVT::v8i64, &X86::VR512RegClass); + addRegisterClass(MVT::v8f64, &X86::VR512RegClass); addRegisterClass(MVT::v32i16, &X86::VR512RegClass); addRegisterClass(MVT::v32f16, &X86::VR512RegClass); - addRegisterClass(MVT::v64i8, &X86::VR512RegClass); + addRegisterClass(MVT::v64i8, &X86::VR512RegClass); for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { - setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal); + setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal); setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal); - setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal); - setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal); - setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal); + setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal); + setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal); + setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal); if (HasBWI) setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal); } - for (MVT VT : { MVT::v16f32, MVT::v8f64 }) { + for (MVT VT : {MVT::v16f32, MVT::v8f64}) { setOperationAction(ISD::FMAXIMUM, VT, Custom); setOperationAction(ISD::FMINIMUM, VT, Custom); setOperationAction(ISD::FMAXIMUMNUM, VT, Custom); setOperationAction(ISD::FMINIMUMNUM, VT, Custom); - setOperationAction(ISD::FNEG, VT, Custom); - setOperationAction(ISD::FABS, VT, Custom); - setOperationAction(ISD::FMA, VT, Legal); + setOperationAction(ISD::FNEG, VT, Custom); + 
setOperationAction(ISD::FABS, VT, Custom); + setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::STRICT_FMA, VT, Legal); setOperationAction(ISD::FCOPYSIGN, VT, Custom); setOperationAction(ISD::FCANONICALIZE, VT, Custom); @@ -1861,93 +1862,93 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget.hasDQI()) setOperationAction(ISD::LLRINT, MVT::v8f64, Legal); - for (MVT VT : { MVT::v16i1, MVT::v16i8 }) { - setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32); - setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32); + for (MVT VT : {MVT::v16i1, MVT::v16i8}) { + setOperationPromotedToType(ISD::FP_TO_SINT, VT, MVT::v16i32); + setOperationPromotedToType(ISD::FP_TO_UINT, VT, MVT::v16i32); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32); } - for (MVT VT : { MVT::v16i16, MVT::v16i32 }) { - setOperationAction(ISD::FP_TO_SINT, VT, Custom); - setOperationAction(ISD::FP_TO_UINT, VT, Custom); + for (MVT VT : {MVT::v16i16, MVT::v16i32}) { + setOperationAction(ISD::FP_TO_SINT, VT, Custom); + setOperationAction(ISD::FP_TO_UINT, VT, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom); } - setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom); - setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom); - - setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal); - setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal); - setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal); - setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal); - setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal); - setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal); - setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal); - setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal); - setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal); - setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal); - setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal); - - setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); - setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); - setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); - setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); - setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom); + + setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal); + setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal); + + setTruncStoreAction(MVT::v8i64, 
MVT::v8i8, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); + setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); + setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); if (HasBWI) - setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); + setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE // to 512-bit rather than use the AVX2 instructions so that we can use // k-masks. if (!Subtarget.hasVLX()) { for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, - MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) { - setOperationAction(ISD::MLOAD, VT, Custom); + MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) { + setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::MSTORE, VT, Custom); } } - setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal); - setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal); - setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal); + setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal); + setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); - setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom); - setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom); - setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); if (HasBWI) { // Extends from v64i1 masks to 512-bit vectors. 
- setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); - setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom); - } - - for (auto VT : { MVT::v16f32, MVT::v8f64 }) { - setOperationAction(ISD::FFLOOR, VT, Legal); - setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); - setOperationAction(ISD::FCEIL, VT, Legal); - setOperationAction(ISD::STRICT_FCEIL, VT, Legal); - setOperationAction(ISD::FTRUNC, VT, Legal); - setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); - setOperationAction(ISD::FRINT, VT, Legal); - setOperationAction(ISD::STRICT_FRINT, VT, Legal); - setOperationAction(ISD::FNEARBYINT, VT, Legal); + setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom); + } + + for (auto VT : {MVT::v16f32, MVT::v8f64}) { + setOperationAction(ISD::FFLOOR, VT, Legal); + setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); + setOperationAction(ISD::FCEIL, VT, Legal); + setOperationAction(ISD::STRICT_FCEIL, VT, Legal); + setOperationAction(ISD::FTRUNC, VT, Legal); + setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); + setOperationAction(ISD::FRINT, VT, Legal); + setOperationAction(ISD::STRICT_FRINT, VT, Legal); + setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); - setOperationAction(ISD::FROUNDEVEN, VT, Legal); + setOperationAction(ISD::FROUNDEVEN, VT, Legal); setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal); - setOperationAction(ISD::FROUND, VT, Custom); + setOperationAction(ISD::FROUND, VT, Custom); } for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) { @@ -1957,36 +1958,36 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom); setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom); - setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom); - setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom); + setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom); + setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom); - setOperationAction(ISD::MUL, MVT::v8i64, Custom); + setOperationAction(ISD::MUL, MVT::v8i64, Custom); setOperationAction(ISD::MUL, MVT::v16i32, Legal); setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom); - setOperationAction(ISD::MUL, MVT::v64i8, Custom); + setOperationAction(ISD::MUL, MVT::v64i8, Custom); setOperationAction(ISD::MULHU, MVT::v16i32, Custom); setOperationAction(ISD::MULHS, MVT::v16i32, Custom); setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom); setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom); - setOperationAction(ISD::MULHS, MVT::v64i8, Custom); - setOperationAction(ISD::MULHU, MVT::v64i8, Custom); + setOperationAction(ISD::MULHS, MVT::v64i8, Custom); + setOperationAction(ISD::MULHU, MVT::v64i8, Custom); setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom); - setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom); + setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? 
Legal : Custom); setOperationAction(ISD::SMULO, MVT::v64i8, Custom); setOperationAction(ISD::UMULO, MVT::v64i8, Custom); - for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) { - setOperationAction(ISD::SRL, VT, Custom); - setOperationAction(ISD::SHL, VT, Custom); - setOperationAction(ISD::SRA, VT, Custom); - setOperationAction(ISD::ROTL, VT, Custom); - setOperationAction(ISD::ROTR, VT, Custom); - setOperationAction(ISD::SETCC, VT, Custom); - setOperationAction(ISD::ABDS, VT, Custom); - setOperationAction(ISD::ABDU, VT, Custom); - setOperationAction(ISD::BITREVERSE, VT, Custom); + for (auto VT : {MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64}) { + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::ROTL, VT, Custom); + setOperationAction(ISD::ROTR, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::ABDS, VT, Custom); + setOperationAction(ISD::ABDU, VT, Custom); + setOperationAction(ISD::BITREVERSE, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. @@ -1994,83 +1995,84 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setCondCodeAction(ISD::SETLE, VT, Custom); } - setOperationAction(ISD::SETCC, MVT::v8f64, Custom); - setOperationAction(ISD::SETCC, MVT::v16f32, Custom); - setOperationAction(ISD::STRICT_FSETCC, MVT::v8f64, Custom); - setOperationAction(ISD::STRICT_FSETCC, MVT::v16f32, Custom); + setOperationAction(ISD::SETCC, MVT::v8f64, Custom); + setOperationAction(ISD::SETCC, MVT::v16f32, Custom); + setOperationAction(ISD::STRICT_FSETCC, MVT::v8f64, Custom); + setOperationAction(ISD::STRICT_FSETCC, MVT::v16f32, Custom); setOperationAction(ISD::STRICT_FSETCCS, MVT::v8f64, Custom); setOperationAction(ISD::STRICT_FSETCCS, MVT::v16f32, Custom); - for (auto VT : { MVT::v16i32, MVT::v8i64 }) { - setOperationAction(ISD::SMAX, VT, Legal); - setOperationAction(ISD::UMAX, VT, Legal); - setOperationAction(ISD::SMIN, VT, Legal); - setOperationAction(ISD::UMIN, VT, Legal); - setOperationAction(ISD::ABS, VT, Legal); - setOperationAction(ISD::CTPOP, VT, Custom); - } - - for (auto VT : { MVT::v64i8, MVT::v32i16 }) { - setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom); - setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom); - setOperationAction(ISD::CTLZ, VT, Custom); - setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom); - setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom); - setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom); - setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom); + for (auto VT : {MVT::v16i32, MVT::v8i64}) { + setOperationAction(ISD::SMAX, VT, Legal); + setOperationAction(ISD::UMAX, VT, Legal); + setOperationAction(ISD::SMIN, VT, Legal); + setOperationAction(ISD::UMIN, VT, Legal); + setOperationAction(ISD::ABS, VT, Legal); + setOperationAction(ISD::CTPOP, VT, Custom); + } + + for (auto VT : {MVT::v64i8, MVT::v32i16}) { + setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::CTPOP, VT, + Subtarget.hasBITALG() ? Legal : Custom); + setOperationAction(ISD::CTLZ, VT, Custom); + setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::UMIN, VT, HasBWI ? 
Legal : Custom); setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom); setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom); setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom); setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom); } - setOperationAction(ISD::FSHL, MVT::v64i8, Custom); - setOperationAction(ISD::FSHR, MVT::v64i8, Custom); - setOperationAction(ISD::FSHL, MVT::v32i16, Custom); - setOperationAction(ISD::FSHR, MVT::v32i16, Custom); - setOperationAction(ISD::FSHL, MVT::v16i32, Custom); - setOperationAction(ISD::FSHR, MVT::v16i32, Custom); + setOperationAction(ISD::FSHL, MVT::v64i8, Custom); + setOperationAction(ISD::FSHR, MVT::v64i8, Custom); + setOperationAction(ISD::FSHL, MVT::v32i16, Custom); + setOperationAction(ISD::FSHR, MVT::v32i16, Custom); + setOperationAction(ISD::FSHL, MVT::v16i32, Custom); + setOperationAction(ISD::FSHR, MVT::v16i32, Custom); if (Subtarget.hasDQI() || Subtarget.hasFP16()) for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) - setOperationAction(Opc, MVT::v8i64, Custom); + setOperationAction(Opc, MVT::v8i64, Custom); if (Subtarget.hasDQI()) - setOperationAction(ISD::MUL, MVT::v8i64, Legal); + setOperationAction(ISD::MUL, MVT::v8i64, Legal); if (Subtarget.hasCDI()) { // NonVLX sub-targets extend 128/256 vectors to use the 512 version. - for (auto VT : { MVT::v16i32, MVT::v8i64} ) { - setOperationAction(ISD::CTLZ, VT, Legal); + for (auto VT : {MVT::v16i32, MVT::v8i64}) { + setOperationAction(ISD::CTLZ, VT, Legal); } } // Subtarget.hasCDI() if (Subtarget.hasVPOPCNTDQ()) { - for (auto VT : { MVT::v16i32, MVT::v8i64 }) + for (auto VT : {MVT::v16i32, MVT::v8i64}) setOperationAction(ISD::CTPOP, VT, Legal); } // Extract subvector is special because the value type // (result) is 256-bit but the source is 512-bit wide. // 128-bit was made Legal under AVX1. 
- for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, - MVT::v16f16, MVT::v8f32, MVT::v4f64 }) + for (auto VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, + MVT::v16f16, MVT::v8f32, MVT::v4f64}) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); - for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64, - MVT::v32f16, MVT::v16f32, MVT::v8f64 }) { - setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); - setOperationAction(ISD::SELECT, VT, Custom); + for (auto VT : {MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64, + MVT::v32f16, MVT::v16f32, MVT::v8f64}) { + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); + setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::CTSELECT, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Custom); - setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); } setF16Action(MVT::v32f16, Expand); setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom); @@ -2081,20 +2083,20 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32); setOperationAction(ISD::SETCC, MVT::v32f16, Custom); - for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) { - setOperationAction(ISD::MLOAD, VT, Legal); - setOperationAction(ISD::MSTORE, VT, Legal); - setOperationAction(ISD::MGATHER, VT, Custom); - setOperationAction(ISD::MSCATTER, VT, Custom); + for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64}) { + setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MSTORE, VT, Legal); + setOperationAction(ISD::MGATHER, VT, Custom); + setOperationAction(ISD::MSCATTER, VT, Custom); } if (HasBWI) { - for (auto VT : { MVT::v64i8, MVT::v32i16 }) { - setOperationAction(ISD::MLOAD, VT, Legal); - setOperationAction(ISD::MSTORE, VT, Legal); + for (auto VT : {MVT::v64i8, MVT::v32i16}) { + setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MSTORE, VT, Legal); } } else { setOperationAction(ISD::STORE, MVT::v32i16, Custom); - setOperationAction(ISD::STORE, MVT::v64i8, Custom); + setOperationAction(ISD::STORE, MVT::v64i8, Custom); } if (Subtarget.hasVBMI2()) { @@ -2110,7 +2112,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FNEG, MVT::v32f16, Custom); setOperationAction(ISD::FABS, MVT::v32f16, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v32f16, Custom); - }// useAVX512Regs + } // useAVX512Regs if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) { for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32, @@ -2127,9 +2129,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // These operations are handled on non-VLX by artificially widening in // isel patterns. 
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom); - setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom); - setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom); if (Subtarget.hasDQI()) { // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion. @@ -2138,31 +2140,31 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) && "Unexpected operation action!"); // v2i64 FP_TO_S/UINT(v2f32) custom conversion. - setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom); } - for (auto VT : { MVT::v2i64, MVT::v4i64 }) { + for (auto VT : {MVT::v2i64, MVT::v4i64}) { setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); - setOperationAction(ISD::ABS, VT, Legal); + setOperationAction(ISD::ABS, VT, Legal); } - for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { - setOperationAction(ISD::ROTL, VT, Custom); - setOperationAction(ISD::ROTR, VT, Custom); + for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64}) { + setOperationAction(ISD::ROTL, VT, Custom); + setOperationAction(ISD::ROTR, VT, Custom); } // Custom legalize 2x32 to get a little better code. setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom); setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom); - for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, - MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) + for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, + MVT::v8f32, MVT::v2f64, MVT::v4f64}) setOperationAction(ISD::MSCATTER, VT, Custom); if (Subtarget.hasDQI()) { @@ -2177,13 +2179,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } if (Subtarget.hasCDI()) { - for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { - setOperationAction(ISD::CTLZ, VT, Legal); + for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64}) { + setOperationAction(ISD::CTLZ, VT, Legal); } } // Subtarget.hasCDI() if (Subtarget.hasVPOPCNTDQ()) { - for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) + for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64}) setOperationAction(ISD::CTPOP, VT, Legal); } @@ -2220,34 +2222,34 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // This block control legalization of v32i1/v64i1 which are available with // AVX512BW.. 
if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { - addRegisterClass(MVT::v32i1, &X86::VK32RegClass); - addRegisterClass(MVT::v64i1, &X86::VK64RegClass); + addRegisterClass(MVT::v32i1, &X86::VK32RegClass); + addRegisterClass(MVT::v64i1, &X86::VK64RegClass); - for (auto VT : { MVT::v32i1, MVT::v64i1 }) { - setOperationAction(ISD::VSELECT, VT, Expand); - setOperationAction(ISD::TRUNCATE, VT, Custom); - setOperationAction(ISD::SETCC, VT, Custom); + for (auto VT : {MVT::v32i1, MVT::v64i1}) { + setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::TRUNCATE, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); - setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::CTSELECT, VT, Custom); - setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); - setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); } - for (auto VT : { MVT::v16i1, MVT::v32i1 }) + for (auto VT : {MVT::v16i1, MVT::v32i1}) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); // Extends from v32i1 masks to 256-bit vectors. - setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); - setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom); for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16, MVT::v16f16, MVT::v8f16}) { - setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom); } @@ -2256,120 +2258,120 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // TODO: Custom widen in lowering on non-VLX and drop the isel patterns? 
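A general note on why CTSELECT stays Custom even for the i1 mask-vector types in these AVX512 blocks: the point of the node is that the selected value is produced purely by data flow, never by a conditional branch on a possibly secret condition. When no CMOV or vector blend is available, the standard branch-free shape is a mask-and-merge; the sketch below shows that shape for a scalar integer type VT, with the helper name and structure assumed for illustration rather than lifted from this patch:

// Sketch: branch-free select r = (T & M) | (F & ~M), where M is all-ones when
// the i1 condition is true and all-zeros otherwise. Assumes the SelectionDAG
// headers already pulled in by this file; names are illustrative.
static SDValue buildBitwiseCTSelectSketch(SelectionDAG &DAG, const SDLoc &DL,
                                          EVT VT, SDValue Cond, SDValue TrueV,
                                          SDValue FalseV) {
  // Sign-extending the i1 condition yields 0 or ~0 in the full value width.
  SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
  SDValue NotMask =
      DAG.getNode(ISD::XOR, DL, VT, Mask, DAG.getAllOnesConstant(DL, VT));
  SDValue KeepTrue = DAG.getNode(ISD::AND, DL, VT, TrueV, Mask);
  SDValue KeepFalse = DAG.getNode(ISD::AND, DL, VT, FalseV, NotMask);
  return DAG.getNode(ISD::OR, DL, VT, KeepTrue, KeepFalse);
}

For the wide vector and mask types registered above, the same idea maps onto AND/ANDN/OR or mask-driven blends; keeping the node Custom leaves the target in charge of choosing such a branch-free form.
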
if (Subtarget.hasBITALG()) { - for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 }) + for (auto VT : {MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16}) setOperationAction(ISD::CTPOP, VT, Legal); } } if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) { - auto setGroup = [&] (MVT VT) { - setOperationAction(ISD::FADD, VT, Legal); - setOperationAction(ISD::STRICT_FADD, VT, Legal); - setOperationAction(ISD::FSUB, VT, Legal); - setOperationAction(ISD::STRICT_FSUB, VT, Legal); - setOperationAction(ISD::FMUL, VT, Legal); - setOperationAction(ISD::STRICT_FMUL, VT, Legal); - setOperationAction(ISD::FDIV, VT, Legal); - setOperationAction(ISD::STRICT_FDIV, VT, Legal); - setOperationAction(ISD::FSQRT, VT, Legal); - setOperationAction(ISD::STRICT_FSQRT, VT, Legal); - - setOperationAction(ISD::FFLOOR, VT, Legal); - setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); - setOperationAction(ISD::FCEIL, VT, Legal); - setOperationAction(ISD::STRICT_FCEIL, VT, Legal); - setOperationAction(ISD::FTRUNC, VT, Legal); - setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); - setOperationAction(ISD::FRINT, VT, Legal); - setOperationAction(ISD::STRICT_FRINT, VT, Legal); - setOperationAction(ISD::FNEARBYINT, VT, Legal); - setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); + auto setGroup = [&](MVT VT) { + setOperationAction(ISD::FADD, VT, Legal); + setOperationAction(ISD::STRICT_FADD, VT, Legal); + setOperationAction(ISD::FSUB, VT, Legal); + setOperationAction(ISD::STRICT_FSUB, VT, Legal); + setOperationAction(ISD::FMUL, VT, Legal); + setOperationAction(ISD::STRICT_FMUL, VT, Legal); + setOperationAction(ISD::FDIV, VT, Legal); + setOperationAction(ISD::STRICT_FDIV, VT, Legal); + setOperationAction(ISD::FSQRT, VT, Legal); + setOperationAction(ISD::STRICT_FSQRT, VT, Legal); + + setOperationAction(ISD::FFLOOR, VT, Legal); + setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); + setOperationAction(ISD::FCEIL, VT, Legal); + setOperationAction(ISD::STRICT_FCEIL, VT, Legal); + setOperationAction(ISD::FTRUNC, VT, Legal); + setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); + setOperationAction(ISD::FRINT, VT, Legal); + setOperationAction(ISD::STRICT_FRINT, VT, Legal); + setOperationAction(ISD::FNEARBYINT, VT, Legal); + setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); setOperationAction(ISD::FROUNDEVEN, VT, Legal); setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal); - setOperationAction(ISD::FROUND, VT, Custom); + setOperationAction(ISD::FROUND, VT, Custom); - setOperationAction(ISD::LOAD, VT, Legal); - setOperationAction(ISD::STORE, VT, Legal); + setOperationAction(ISD::LOAD, VT, Legal); + setOperationAction(ISD::STORE, VT, Legal); - setOperationAction(ISD::FMA, VT, Legal); - setOperationAction(ISD::STRICT_FMA, VT, Legal); - setOperationAction(ISD::VSELECT, VT, Legal); - setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::FMA, VT, Legal); + setOperationAction(ISD::STRICT_FMA, VT, Legal); + setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::CTSELECT, VT, Custom); - setOperationAction(ISD::FNEG, VT, Custom); - setOperationAction(ISD::FABS, VT, Custom); - setOperationAction(ISD::FCOPYSIGN, VT, Custom); + setOperationAction(ISD::FNEG, VT, Custom); + setOperationAction(ISD::FABS, VT, Custom); + setOperationAction(ISD::FCOPYSIGN, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); - 
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); - setOperationAction(ISD::SETCC, VT, Custom); - setOperationAction(ISD::STRICT_FSETCC, VT, Custom); - setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::STRICT_FSETCC, VT, Custom); + setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); }; // AVX512_FP16 scalar operations setGroup(MVT::f16); - setOperationAction(ISD::FREM, MVT::f16, Promote); - setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote); - setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); - setOperationAction(ISD::BR_CC, MVT::f16, Expand); - setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote); - setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal); - setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal); - setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); - setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); - setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom); - setOperationAction(ISD::FMINIMUM, MVT::f16, Custom); - setOperationAction(ISD::FMAXIMUMNUM, MVT::f16, Custom); - setOperationAction(ISD::FMINIMUMNUM, MVT::f16, Custom); - setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal); - setOperationAction(ISD::LRINT, MVT::f16, Legal); - setOperationAction(ISD::LLRINT, MVT::f16, Legal); + setOperationAction(ISD::FREM, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote); + setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); + setOperationAction(ISD::BR_CC, MVT::f16, Expand); + setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote); + setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal); + setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal); + setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); + setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom); + setOperationAction(ISD::FMINIMUM, MVT::f16, Custom); + setOperationAction(ISD::FMAXIMUMNUM, MVT::f16, Custom); + setOperationAction(ISD::FMINIMUMNUM, MVT::f16, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal); + setOperationAction(ISD::LRINT, MVT::f16, Legal); + setOperationAction(ISD::LLRINT, MVT::f16, Legal); setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand); setCondCodeAction(ISD::SETUNE, MVT::f16, Expand); if (Subtarget.useAVX512Regs()) { setGroup(MVT::v32f16); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal); - setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal); - setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal); - setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal); - setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal); - setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal); - setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom); - - setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom); - setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom); - setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, 
Custom); - setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal); + setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom); + + setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom); + setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8, MVT::v32i16); - setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16); + setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8, MVT::v32i16); - setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16); + setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1, MVT::v32i16); - setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16); + setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1, MVT::v32i16); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal); + setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal); setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal); setOperationAction(ISD::FMINIMUM, MVT::v32f16, Custom); @@ -2380,40 +2382,40 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::LLRINT, MVT::v8f16, Legal); } - setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); - setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom); - setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom); if (Subtarget.hasVLX()) { setGroup(MVT::v8f16); setGroup(MVT::v16f16); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal); - setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal); - setOperationAction(ISD::SINT_TO_FP, 
MVT::v8i16, Legal); - setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal); - setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal); - setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal); - - setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal); - setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal); - setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal); - setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal); + + setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal); // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal); setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal); @@ -2421,7 +2423,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal); // Need to custom widen these to prevent scalarization. 
- setOperationAction(ISD::LOAD, MVT::v4f16, Custom); + setOperationAction(ISD::LOAD, MVT::v4f16, Custom); setOperationAction(ISD::STORE, MVT::v4f16, Custom); setOperationAction(ISD::FMINIMUM, MVT::v8f16, Custom); @@ -2514,52 +2516,52 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) { - setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); - setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); + setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); - setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); + setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); - setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); if (Subtarget.hasBWI()) { - setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); - setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); + setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); + setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); } if (Subtarget.hasFP16()) { // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64 - setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom); // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16 - setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom); // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16 - setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom); - setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom); - setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom); - setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom); + setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom); + setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom); // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32 - setOperationAction(ISD::FP_EXTEND, MVT::v2f16, 
Custom); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom); - setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom); } } @@ -2597,7 +2599,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // FIXME: We really should do custom legalization for addition and // subtraction on x86-32 once PR3203 is fixed. We really can't do much better // than generic legalization for 64-bit multiplication-with-overflow, though. - for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { + for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; // Add/Sub/Mul with overflow operations are custom lowered. @@ -2881,8 +2883,9 @@ static bool isLogicOp(unsigned Opcode) { } static bool isTargetShuffle(unsigned Opcode) { - switch(Opcode) { - default: return false; + switch (Opcode) { + default: + return false; case X86ISD::BLENDI: case X86ISD::PSHUFB: case X86ISD::PSHUFD: @@ -2923,7 +2926,8 @@ static bool isTargetShuffle(unsigned Opcode) { static bool isTargetShuffleVariableMask(unsigned Opcode) { switch (Opcode) { - default: return false; + default: + return false; // Target Shuffles. case X86ISD::PSHUFB: case X86ISD::VPERMILPV: @@ -2949,9 +2953,8 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { if (ReturnAddrIndex == 0) { // Set up a frame object for the return address. unsigned SlotSize = RegInfo->getSlotSize(); - ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize, - -(int64_t)SlotSize, - false); + ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject( + SlotSize, -(int64_t)SlotSize, false); FuncInfo->setRAIndex(ReturnAddrIndex); } @@ -3009,7 +3012,7 @@ static bool isX86CCSigned(X86::CondCode X86CC) { static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) { switch (SetCCOpcode) { - // clang-format off + // clang-format off default: llvm_unreachable("Invalid integer condition!"); case ISD::SETEQ: return X86::COND_E; case ISD::SETGT: return X86::COND_G; @@ -3021,7 +3024,7 @@ static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) { case ISD::SETUGT: return X86::COND_A; case ISD::SETULE: return X86::COND_BE; case ISD::SETUGE: return X86::COND_AE; - // clang-format on + // clang-format on } } @@ -3059,14 +3062,14 @@ static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, // First determine if it is required or is profitable to flip the operands. // If LHS is a foldable load, but RHS is not, flip the condition. 
- if (ISD::isNON_EXTLoad(LHS.getNode()) && - !ISD::isNON_EXTLoad(RHS.getNode())) { + if (ISD::isNON_EXTLoad(LHS.getNode()) && !ISD::isNON_EXTLoad(RHS.getNode())) { SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); std::swap(LHS, RHS); } switch (SetCCOpcode) { - default: break; + default: + break; case ISD::SETOLT: case ISD::SETOLE: case ISD::SETUGT: @@ -3082,7 +3085,7 @@ static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, // 1 | 0 | 0 | X == Y // 1 | 1 | 1 | unordered switch (SetCCOpcode) { - // clang-format off + // clang-format off default: llvm_unreachable("Condcode should be pre-legalized away"); case ISD::SETUEQ: case ISD::SETEQ: return X86::COND_E; @@ -3104,7 +3107,7 @@ static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, case ISD::SETO: return X86::COND_NP; case ISD::SETOEQ: case ISD::SETUNE: return X86::COND_INVALID; - // clang-format on + // clang-format on } } @@ -3139,7 +3142,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MONone; Info.offset = 0; - const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic); + const IntrinsicData *IntrData = getIntrinsicWithChain(Intrinsic); if (!IntrData) { switch (Intrinsic) { case Intrinsic::x86_aesenc128kl: @@ -3232,7 +3235,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case TRUNCATE_TO_MEM_VI32: { Info.opc = ISD::INTRINSIC_VOID; Info.ptrVal = I.getArgOperand(0); - MVT VT = MVT::getVT(I.getArgOperand(1)->getType()); + MVT VT = MVT::getVT(I.getArgOperand(1)->getType()); MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE; if (IntrData->Type == TRUNCATE_TO_MEM_VI8) ScalarVT = MVT::i8; @@ -3252,8 +3255,8 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = nullptr; MVT DataVT = MVT::getVT(I.getType()); MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType()); - unsigned NumElts = std::min(DataVT.getVectorNumElements(), - IndexVT.getVectorNumElements()); + unsigned NumElts = + std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements()); Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); Info.align = Align(1); Info.flags |= MachineMemOperand::MOLoad; @@ -3264,8 +3267,8 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = nullptr; MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType()); MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType()); - unsigned NumElts = std::min(DataVT.getVectorNumElements(), - IndexVT.getVectorNumElements()); + unsigned NumElts = + std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements()); Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); Info.align = Align(1); Info.flags |= MachineMemOperand::MOStore; @@ -3424,8 +3427,9 @@ bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, // Mask vectors support all subregister combinations and operations that // extract half of vector. 
if (ResVT.getVectorElementType() == MVT::i1) - return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) && - (Index == ResVT.getVectorNumElements())); + return Index == 0 || + ((ResVT.getSizeInBits() == SrcVT.getSizeInBits() * 2) && + (Index == ResVT.getVectorNumElements())); return (Index % ResVT.getVectorNumElements()) == 0; } @@ -3485,9 +3489,9 @@ bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const { (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16; } -bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, - const SelectionDAG &DAG, - const MachineMemOperand &MMO) const { +bool X86TargetLowering::isLoadBitCastBeneficial( + EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, + const MachineMemOperand &MMO) const { if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() && BitcastVT.getVectorElementType() == MVT::i1) return false; @@ -3496,8 +3500,8 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, return false; // If both types are legal vectors, it's always ok to convert them. - if (LoadVT.isVector() && BitcastVT.isVector() && - isTypeLegal(LoadVT) && isTypeLegal(BitcastVT)) + if (LoadVT.isVector() && BitcastVT.isVector() && isTypeLegal(LoadVT) && + isTypeLegal(BitcastVT)) return true; return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO); @@ -3521,9 +3525,7 @@ bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT, return true; } -bool X86TargetLowering::isCtlzFast() const { - return Subtarget.hasFastLZCNT(); -} +bool X86TargetLowering::isCtlzFast() const { return Subtarget.hasFastLZCNT(); } bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial( const Instruction &AndI) const { @@ -3952,8 +3954,7 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask, return true; } -static bool canWidenShuffleElements(ArrayRef<int> Mask, - const APInt &Zeroable, +static bool canWidenShuffleElements(ArrayRef<int> Mask, const APInt &Zeroable, bool V2IsZero, SmallVectorImpl<int> &WidenedMask) { // Create an alternative mask with info about zeroable elements. @@ -4037,7 +4038,7 @@ bool X86::isZeroNode(SDValue Elt) { static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask = false) { - SmallVector Ops; + SmallVector Ops; bool Split = false; MVT ConstVecVT = VT; @@ -4051,12 +4052,12 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG, MVT EltVT = ConstVecVT.getVectorElementType(); for (unsigned i = 0; i < NumElts; ++i) { bool IsUndef = Values[i] < 0 && IsMask; - SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) : - DAG.getConstant(Values[i], dl, EltVT); + SDValue OpNode = + IsUndef ? DAG.getUNDEF(EltVT) : DAG.getConstant(Values[i], dl, EltVT); Ops.push_back(OpNode); if (Split) - Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) : - DAG.getConstant(0, dl, EltVT)); + Ops.push_back(IsUndef ?
DAG.getUNDEF(EltVT) + : DAG.getConstant(0, dl, EltVT)); } SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops); if (Split) @@ -4064,8 +4065,8 @@ static SDValue getConstVector(ArrayRef Values, MVT VT, SelectionDAG &DAG, return ConstsNode; } -static SDValue getConstVector(ArrayRef Bits, const APInt &Undefs, - MVT VT, SelectionDAG &DAG, const SDLoc &dl) { +static SDValue getConstVector(ArrayRef Bits, const APInt &Undefs, MVT VT, + SelectionDAG &DAG, const SDLoc &dl) { assert(Bits.size() == Undefs.getBitWidth() && "Unequal constant and undef arrays"); SmallVector Ops; @@ -4100,8 +4101,8 @@ static SDValue getConstVector(ArrayRef Bits, const APInt &Undefs, return DAG.getBitcast(VT, ConstsNode); } -static SDValue getConstVector(ArrayRef Bits, MVT VT, - SelectionDAG &DAG, const SDLoc &dl) { +static SDValue getConstVector(ArrayRef Bits, MVT VT, SelectionDAG &DAG, + const SDLoc &dl) { APInt Undefs = APInt::getZero(Bits.size()); return getConstVector(Bits, Undefs, VT, DAG, dl); } @@ -4638,8 +4639,7 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) { // May need to promote to a legal type. Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, - DAG.getConstant(0, dl, WideOpVT), - SubVec, Idx); + DAG.getConstant(0, dl, WideOpVT), SubVec, Idx); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } @@ -4654,20 +4654,18 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, if (IdxVal == 0) { // Zero lower bits of the Vec SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8); - Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, - ZeroIdx); + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); // Merge them together, SubVec should be zero extended. SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, - DAG.getConstant(0, dl, WideOpVT), - SubVec, ZeroIdx); + DAG.getConstant(0, dl, WideOpVT), SubVec, ZeroIdx); Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } - SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, - Undef, SubVec, ZeroIdx); + SubVec = + DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, SubVec, ZeroIdx); if (Vec.isUndef()) { assert(IdxVal != 0 && "Unexpected index"); @@ -4705,12 +4703,11 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, // isel to optimize when bits are known zero. Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, - DAG.getConstant(0, dl, WideOpVT), - Vec, ZeroIdx); + DAG.getConstant(0, dl, WideOpVT), Vec, ZeroIdx); } else { // Otherwise use explicit shifts to zero the bits. - Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, - Undef, Vec, ZeroIdx); + Vec = + DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); NumElems = WideOpVT.getVectorNumElements(); SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8); Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); @@ -4763,9 +4760,9 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, // Isolate the bits after the last inserted bit. 
unsigned HighShift = IdxVal + SubVecNumElems; SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, - DAG.getTargetConstant(HighShift, dl, MVT::i8)); + DAG.getTargetConstant(HighShift, dl, MVT::i8)); High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High, - DAG.getTargetConstant(HighShift, dl, MVT::i8)); + DAG.getTargetConstant(HighShift, dl, MVT::i8)); // Now OR all 3 pieces together. Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High); @@ -4846,8 +4843,8 @@ static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, return DAG.getNode(ISD::OR, DL, VT, LHS, RHS); } -void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl &Mask, - bool Lo, bool Unary) { +void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl &Mask, bool Lo, + bool Unary) { assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 && "Illegal vector type to unpack"); assert(Mask.empty() && "Expected an empty shuffle mask vector"); @@ -4984,13 +4981,12 @@ static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, /// This produces a shuffle where the low element of V2 is swizzled into the /// zero/undef vector, landing at element Idx. /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). -static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, - bool IsZero, +static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = V2.getSimpleValueType(); - SDValue V1 = IsZero - ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); + SDValue V1 = + IsZero ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); int NumElems = VT.getVectorNumElements(); SmallVector MaskVec(NumElems); for (int i = 0; i != NumElems; ++i) @@ -8568,7 +8564,7 @@ static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, case ISD::FADD: HOpcode = X86ISD::FHADD; break; case ISD::FSUB: HOpcode = X86ISD::FHSUB; break; default: return false; - // clang-format on + // clang-format on } } @@ -8598,8 +8594,7 @@ static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, // op (extract_vector_elt A, I), (extract_vector_elt A, I+1) unsigned ExtIndex0 = Op0.getConstantOperandVal(1); unsigned ExtIndex1 = Op1.getConstantOperandVal(1); - unsigned ExpectedIndex = i * NumEltsIn128Bits + - (j % NumEltsIn64Bits) * 2; + unsigned ExpectedIndex = i * NumEltsIn128Bits + (j % NumEltsIn64Bits) * 2; if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1) continue; @@ -9249,8 +9244,8 @@ LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget); } -SDValue -X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, + SelectionDAG &DAG) const { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); @@ -9474,14 +9469,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { } // Is it a vector logical left shift? 
- if (NumElems == 2 && Idx == 1 && - X86::isZeroNode(Op.getOperand(0)) && + if (NumElems == 2 && Idx == 1 && X86::isZeroNode(Op.getOperand(0)) && !X86::isZeroNode(Op.getOperand(1))) { unsigned NumBits = VT.getSizeInBits(); - return getVShift(true, VT, - DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, - VT, Op.getOperand(1)), - NumBits/2, DAG, *this, dl); + return getVShift( + true, VT, + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(1)), + NumBits / 2, DAG, *this, dl); } if (IsAllConstants) // Otherwise, it's better to do a constpool load. @@ -9494,7 +9488,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // place. if (EVTBits == 32) { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); - return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); + return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, + DAG); } } @@ -9533,8 +9528,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // build_vector and broadcast it. // TODO: We could probably generalize this more. if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) { - SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1), - DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) }; + SDValue Ops[4] = {Op.getOperand(0), Op.getOperand(1), DAG.getUNDEF(EltVT), + DAG.getUNDEF(EltVT)}; auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef Ops) { // Make sure all the even/odd operands match. for (unsigned i = 2; i != NumElems; ++i) @@ -9550,8 +9545,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DAG.getBuildVector(NarrowVT, dl, Ops)); // Broadcast from v2i64/v2f64 and cast to final VT. MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2); - return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT, - NewBV)); + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT, NewBV)); } } @@ -9564,7 +9559,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDValue Lower = DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2)); SDValue Upper = DAG.getBuildVector( - HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2)); + HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2)); // Recreate the wider vector with the lower and upper part. return concatSubVectors(Lower, Upper, DAG, dl); @@ -9575,8 +9570,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (NumNonZero == 1) { // One half is zero or undef. unsigned Idx = NonZeroMask.countr_zero(); - SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, - Op.getOperand(Idx)); + SDValue V2 = + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(Idx)); return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); } return SDValue(); @@ -9611,30 +9606,28 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { for (unsigned i = 0; i < 2; ++i) { switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) { - default: llvm_unreachable("Unexpected NonZero count"); - case 0: - Ops[i] = Ops[i*2]; // Must be a zero vector. - break; - case 1: - Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]); - break; - case 2: - Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); - break; - case 3: - Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); - break; + default: + llvm_unreachable("Unexpected NonZero count"); + case 0: + Ops[i] = Ops[i * 2]; // Must be a zero vector. 
+ break; + case 1: + Ops[i] = getMOVL(DAG, dl, VT, Ops[i * 2 + 1], Ops[i * 2]); + break; + case 2: + Ops[i] = getMOVL(DAG, dl, VT, Ops[i * 2], Ops[i * 2 + 1]); + break; + case 3: + Ops[i] = getUnpackl(DAG, dl, VT, Ops[i * 2], Ops[i * 2 + 1]); + break; } } bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2; bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2; - int MaskVec[] = { - Reverse1 ? 1 : 0, - Reverse1 ? 0 : 1, - static_cast(Reverse2 ? NumElems+1 : NumElems), - static_cast(Reverse2 ? NumElems : NumElems+1) - }; + int MaskVec[] = {Reverse1 ? 1 : 0, Reverse1 ? 0 : 1, + static_cast(Reverse2 ? NumElems + 1 : NumElems), + static_cast(Reverse2 ? NumElems : NumElems + 1)}; return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec); } @@ -9653,7 +9646,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { Result = DAG.getUNDEF(VT); for (unsigned i = 1; i < NumElems; ++i) { - if (Op.getOperand(i).isUndef()) continue; + if (Op.getOperand(i).isUndef()) + continue; Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, Op.getOperand(i), DAG.getVectorIdxConstant(i, dl)); } @@ -9678,14 +9672,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) { // Generate scaled UNPCKL shuffle mask. SmallVector Mask; - for(unsigned i = 0; i != Scale; ++i) + for (unsigned i = 0; i != Scale; ++i) Mask.push_back(i); for (unsigned i = 0; i != Scale; ++i) - Mask.push_back(NumElems+i); + Mask.push_back(NumElems + i); Mask.append(NumElems - Mask.size(), SM_SentinelUndef); for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i) - Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask); + Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2 * i], Ops[(2 * i) + 1], Mask); } return Ops[0]; } @@ -9711,15 +9705,14 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl, if (SubVec.isUndef()) continue; if (ISD::isFreezeUndef(SubVec.getNode())) { - // If the freeze(undef) has multiple uses then we must fold to zero. - if (SubVec.hasOneUse()) { - ++NumFreezeUndef; - } else { - ++NumZero; - Undefs.insert(SubVec); - } - } - else if (ISD::isBuildVectorAllZeros(SubVec.getNode())) + // If the freeze(undef) has multiple uses then we must fold to zero. + if (SubVec.hasOneUse()) { + ++NumFreezeUndef; + } else { + ++NumZero; + Undefs.insert(SubVec); + } + } else if (ISD::isBuildVectorAllZeros(SubVec.getNode())) ++NumZero; else { assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range. @@ -9733,9 +9726,9 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl, MVT HalfVT = ResVT.getHalfNumVectorElementsVT(); ArrayRef Ops = Op->ops(); SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, - Ops.slice(0, NumOperands/2)); + Ops.slice(0, NumOperands / 2)); SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, - Ops.slice(NumOperands/2)); + Ops.slice(NumOperands / 2)); return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } @@ -9768,7 +9761,7 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl, // TODO: Merge this with LowerAVXCONCAT_VECTORS? 
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, - SelectionDAG & DAG) { + SelectionDAG &DAG) { MVT ResVT = Op.getSimpleValueType(); unsigned NumOperands = Op.getNumOperands(); assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && @@ -9839,8 +9832,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl, DAG.getVectorIdxConstant(NumElems / 2, dl)); } -static SDValue LowerCONCAT_VECTORS(SDValue Op, - const X86Subtarget &Subtarget, +static SDValue LowerCONCAT_VECTORS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); @@ -10062,8 +10054,8 @@ static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, // Ok, handle the in-lane shuffles by detecting if and when they repeat. // Adjust second vector indices to start at LaneSize instead of Size. - int LocalM = Mask[i] < Size ? Mask[i] % LaneSize - : Mask[i] % LaneSize + LaneSize; + int LocalM = + Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize; if (RepeatedMask[i % LaneSize] < 0) // This is the first non-undef entry in this slot of a 128-bit lane. RepeatedMask[i % LaneSize] = LocalM; @@ -10081,8 +10073,7 @@ is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask); } -static bool -is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask) { +static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask) { SmallVector RepeatedMask; return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask); } @@ -10381,8 +10372,8 @@ static SDValue getSHUFPDImmForMask(ArrayRef Mask, const SDLoc &DL, // // The function looks for a sub-mask that the nonzero elements are in // increasing order. If such sub-mask exist. The function returns true. -static bool isNonZeroElementsInOrder(const APInt &Zeroable, - ArrayRef Mask, const EVT &VectorType, +static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef Mask, + const EVT &VectorType, bool &IsZeroSideLeft) { int NextElement = -1; // Check if the Mask's nonzero elements are in increasing order. @@ -11162,7 +11153,7 @@ static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, if (M == SM_SentinelUndef) continue; if (M == Elt || (0 <= M && M < NumElts && - IsElementEquivalent(NumElts, V1, V1, M, Elt))) { + IsElementEquivalent(NumElts, V1, V1, M, Elt))) { Mask[Elt] = Elt; LaneV1InUse = true; continue; @@ -11295,8 +11286,7 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, // If we have VPTERNLOG, we can use that as a bit blend. if (Subtarget.hasVLX()) - if (SDValue BitBlend = - lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) + if (SDValue BitBlend = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) return BitBlend; // Scale the blend by the number of bytes per element. @@ -11604,9 +11594,11 @@ static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, /// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then /// permuting the elements of the result in place. 
-static SDValue lowerShuffleAsByteRotateAndPermute( - const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, - const X86Subtarget &Subtarget, SelectionDAG &DAG) { +static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef Mask, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) || (VT.is256BitVector() && !Subtarget.hasAVX2()) || (VT.is512BitVector() && !Subtarget.hasBWI())) @@ -11804,9 +11796,9 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge( // If either input vector provides only a single element which is repeated // multiple times, unpacking from both input vectors would generate worse // code. e.g. for - // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4 - // it is better to process t4 first to create a vector of t4[0], then unpack - // that vector with t2. + // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, + // t4 it is better to process t4 first to create a vector of t4[0], then + // unpack that vector with t2. if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) && !isSingleElementRepeatedMask(V2Mask)) if (SDValue UnpackPerm = @@ -11818,8 +11810,8 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge( return RotatePerm; // Unpack/rotate failed - try again with variable blends. - if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, - DAG)) + if (SDValue BlendPerm = + lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) return BlendPerm; if (VT.getScalarSizeInBits() >= 32) @@ -11933,7 +11925,7 @@ static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, SDValue Lo, Hi; for (int i = 0; i < NumElts; ++i) { int M = Mask[i]; - assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) && + assert((M == SM_SentinelUndef || (0 <= M && M < (2 * NumElts))) && "Unexpected mask index."); if (M < 0) continue; @@ -12055,8 +12047,7 @@ static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, "Rotate-based lowering only supports 128-bit lowering!"); assert(Mask.size() <= 16 && "Can shuffle at most 16 bytes in a 128-bit vector!"); - assert(ByteVT == MVT::v16i8 && - "SSE2 rotate lowering only needed for v16i8!"); + assert(ByteVT == MVT::v16i8 && "SSE2 rotate lowering only needed for v16i8!"); // Default SSE2 implementation int LoByteShift = 16 - ByteRotation; @@ -12091,8 +12082,9 @@ static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, "Only 32-bit and 64-bit elements are supported!"); // 128/256-bit vectors are only supported with VLX. - assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector())) - && "VLX required for 128/256-bit vectors"); + assert( + (Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector())) && + "VLX required for 128/256-bit vectors"); SDValue Lo = V1, Hi = V2; int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask); @@ -12644,8 +12636,7 @@ static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT, /// are both incredibly common and often quite performance sensitive. 
static SDValue lowerShuffleAsZeroOrAnyExtend( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const APInt &Zeroable, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Bits = VT.getSizeInBits(); int NumLanes = Bits / 128; int NumElements = VT.getVectorNumElements(); @@ -12771,7 +12762,8 @@ static SDValue getScalarValueForVectorElement(SDValue V, int Idx, // If the bitcasts shift the element size, we can't extract an equivalent // element from it. MVT NewVT = V.getSimpleValueType(); - if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) + if (!NewVT.isVector() || + NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) return SDValue(); if (V.getOpcode() == ISD::BUILD_VECTOR || @@ -12795,7 +12787,7 @@ static bool isShuffleFoldableLoad(SDValue V) { ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode()); } -template <typename T> +template <typename T> static bool isSoftF16(T VT, const X86Subtarget &Subtarget) { T EltVT = VT.getScalarType(); return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) || @@ -12808,8 +12800,7 @@ static bool isSoftF16(T VT, const X86Subtarget &Subtarget) { /// across all subtarget feature sets. static SDValue lowerShuffleAsElementInsertion( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const APInt &Zeroable, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT ExtVT = VT; MVT EltVT = VT.getVectorElementType(); unsigned NumElts = VT.getVectorNumElements(); @@ -12842,8 +12833,8 @@ static SDValue lowerShuffleAsElementInsertion( // all the smarts here sunk into that routine. However, the current // lowering of BUILD_VECTOR makes that nearly impossible until the old // vector shuffle lowering is dead. - SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(), - DAG); + SDValue V2S = + getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(), DAG); if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) { // We need to zext the scalar if it is smaller than an i32. V2S = DAG.getBitcast(EltVT, V2S); @@ -13046,8 +13037,8 @@ static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, // Check that both sources are extracts of the same source vector. if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR || N1.getOpcode() != ISD::EXTRACT_SUBVECTOR || - N0.getOperand(0) != N1.getOperand(0) || - !N0.hasOneUse() || !N1.hasOneUse()) + N0.getOperand(0) != N1.getOperand(0) || !N0.hasOneUse() || + !N1.hasOneUse()) return SDValue(); SDValue WideVec = N0.getOperand(0); @@ -13077,8 +13068,8 @@ static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, NewMask.append(NumElts, -1); // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0 - SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT), - NewMask); + SDValue Shuf = + DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT), NewMask); // This is free: ymm -> xmm.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf, DAG.getVectorIdxConstant(0, DL)); @@ -13277,8 +13268,8 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, if (!V.getValueType().isVector()) { assert(V.getScalarValueSizeInBits() == NumEltBits && "Unexpected scalar size"); - MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(), - VT.getVectorNumElements()); + MVT BroadcastVT = + MVT::getVectorVT(V.getSimpleValueType(), VT.getVectorNumElements()); return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V)); } @@ -13303,8 +13294,8 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, // elements are zeroable. static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, - const APInt &Zeroable, - ArrayRef Mask, SelectionDAG &DAG) { + const APInt &Zeroable, ArrayRef Mask, + SelectionDAG &DAG) { assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); @@ -13756,8 +13747,8 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef Mask, // when the V2 input is targeting element 0 of the mask -- that is the fast // case here. if (NumV2Elements == 1 && Mask[0] >= 4) - if (SDValue V = lowerShuffleAsElementInsertion( - DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return V; if (Subtarget.hasSSE41()) { @@ -13766,8 +13757,8 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef Mask, return V; if (!isSingleSHUFPSMask(Mask)) - if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1, - V2, Mask, DAG)) + if (SDValue BlendPerm = + lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1, V2, Mask, DAG)) return BlendPerm; } @@ -13859,8 +13850,8 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef Mask, // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) - if (SDValue V = lowerShuffleAsElementInsertion( - DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return V; // We have different paths for blend lowering, but they all must use the @@ -13990,7 +13981,7 @@ static SDValue lowerV8I16GeneralSingleInputShuffle( }; if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) { - int PSHUFDMask[4] = { -1, -1, -1, -1 }; + int PSHUFDMask[4] = {-1, -1, -1, -1}; SmallVector, 4> DWordPairs; int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2); @@ -14094,7 +14085,8 @@ static SDValue lowerV8I16GeneralSingleInputShuffle( int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0]; int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset); int TripleNonInputIdx = - TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); + TripleInputSum - + std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); TripleDWord = TripleNonInputIdx / 2; // We use xor with one to compute the adjacent DWord to whichever one the @@ -14172,9 +14164,9 @@ static SDValue lowerV8I16GeneralSingleInputShuffle( // Adjust the mask to match the new locations of A and B. 
for (int &M : Mask) - if (M >= 0 && M/2 == ADWord) + if (M >= 0 && M / 2 == ADWord) M = 2 * BDWord + M % 2; - else if (M >= 0 && M/2 == BDWord) + else if (M >= 0 && M / 2 == BDWord) M = 2 * ADWord + M % 2; // Recurse back into this routine to re-compute state now that this isn't @@ -14202,33 +14194,33 @@ static SDValue lowerV8I16GeneralSingleInputShuffle( [&PSHUFDMask](ArrayRef InPlaceInputs, ArrayRef IncomingInputs, MutableArrayRef SourceHalfMask, MutableArrayRef HalfMask, int HalfOffset) { - if (InPlaceInputs.empty()) - return; - if (InPlaceInputs.size() == 1) { - SourceHalfMask[InPlaceInputs[0] - HalfOffset] = - InPlaceInputs[0] - HalfOffset; - PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; - return; - } - if (IncomingInputs.empty()) { - // Just fix all of the in place inputs. - for (int Input : InPlaceInputs) { - SourceHalfMask[Input - HalfOffset] = Input - HalfOffset; - PSHUFDMask[Input / 2] = Input / 2; - } - return; - } + if (InPlaceInputs.empty()) + return; + if (InPlaceInputs.size() == 1) { + SourceHalfMask[InPlaceInputs[0] - HalfOffset] = + InPlaceInputs[0] - HalfOffset; + PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; + return; + } + if (IncomingInputs.empty()) { + // Just fix all of the in place inputs. + for (int Input : InPlaceInputs) { + SourceHalfMask[Input - HalfOffset] = Input - HalfOffset; + PSHUFDMask[Input / 2] = Input / 2; + } + return; + } - assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); - SourceHalfMask[InPlaceInputs[0] - HalfOffset] = - InPlaceInputs[0] - HalfOffset; - // Put the second input next to the first so that they are packed into - // a dword. We find the adjacent index by toggling the low bit. - int AdjIndex = InPlaceInputs[0] ^ 1; - SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset; - llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex); - PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; - }; + assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); + SourceHalfMask[InPlaceInputs[0] - HalfOffset] = + InPlaceInputs[0] - HalfOffset; + // Put the second input next to the first so that they are packed into + // a dword. We find the adjacent index by toggling the low bit. + int AdjIndex = InPlaceInputs[0] ^ 1; + SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset; + llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex); + PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; + }; fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0); fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4); @@ -14237,10 +14229,12 @@ static SDValue lowerV8I16GeneralSingleInputShuffle( // FIXME: This operation could almost certainly be simplified dramatically to // look more like the 3-1 fixing operation. auto moveInputsToRightHalf = [&PSHUFDMask]( - MutableArrayRef IncomingInputs, ArrayRef ExistingInputs, - MutableArrayRef SourceHalfMask, MutableArrayRef HalfMask, - MutableArrayRef FinalSourceHalfMask, int SourceOffset, - int DestOffset) { + MutableArrayRef IncomingInputs, + ArrayRef ExistingInputs, + MutableArrayRef SourceHalfMask, + MutableArrayRef HalfMask, + MutableArrayRef FinalSourceHalfMask, + int SourceOffset, int DestOffset) { auto isWordClobbered = [](ArrayRef SourceHalfMask, int Word) { return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word; }; @@ -14436,9 +14430,11 @@ static SDValue lowerV8I16GeneralSingleInputShuffle( /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the /// blend if only one input is used. 
-static SDValue lowerShuffleAsBlendOfPSHUFBs( - const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, - const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) { +static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef Mask, + const APInt &Zeroable, + SelectionDAG &DAG, bool &V1InUse, + bool &V2InUse) { assert(!is128BitLaneCrossingShuffleMask(VT, Mask) && "Lane crossing shuffle masks not supported"); @@ -14533,8 +14529,8 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef Mask, return Broadcast; // Try to use bit rotation instructions. - if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask, - Subtarget, DAG)) + if (SDValue Rotate = + lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask, Subtarget, DAG)) return Rotate; // Use dedicated unpack instructions for masks that match their pattern. @@ -14569,14 +14565,14 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef Mask, // See if we can use SSE4A Extraction / Insertion. if (Subtarget.hasSSE4A()) - if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, - Zeroable, DAG)) + if (SDValue V = + lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, Zeroable, DAG)) return V; // There are special ways we can lower some single-element blends. if (NumV2Inputs == 1) - if (SDValue V = lowerShuffleAsElementInsertion( - DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return V; // We have different paths for blend lowering, but they all must use the @@ -14692,8 +14688,8 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef Mask, // can both shuffle and set up the inefficient blend. if (!IsBlendSupported && Subtarget.hasSSSE3()) { bool V1InUse, V2InUse; - return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, - Zeroable, DAG, V1InUse, V2InUse); + return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, Zeroable, + DAG, V1InUse, V2InUse); } // We can always bit-blend if we have to so the fallback strategy is to @@ -14826,8 +14822,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef Mask, // See if we can use SSE4A Extraction / Insertion. if (Subtarget.hasSSE4A()) - if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, - Zeroable, DAG)) + if (SDValue V = + lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG)) return V; int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; }); @@ -14840,8 +14836,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef Mask, return Broadcast; // Try to use bit rotation instructions. - if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask, - Subtarget, DAG)) + if (SDValue Rotate = + lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask, Subtarget, DAG)) return Rotate; if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG)) @@ -14882,7 +14878,7 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef Mask, int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1}; SmallDenseMap LaneMap; for (int I : InPlaceInputs) { - PreDupI16Shuffle[I/2] = I/2; + PreDupI16Shuffle[I / 2] = I / 2; LaneMap[I] = I; } int j = TargetLo ? 0 : 4, je = j + 4; @@ -14896,7 +14892,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef Mask, ++j; if (j == je) - // We can't place the inputs into a single half with a simple i16 shuffle, so bail. 
+ // We can't place the inputs into a single half with a simple i16 + // shuffle, so bail. return SDValue(); // Map this input with the i16 shuffle. @@ -15017,8 +15014,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef Mask, // Use PALIGNR+Permute if possible - permute might become PSHUFB but the // PALIGNR will be cheaper than the second PSHUFB+OR. - if (SDValue V = lowerShuffleAsByteRotateAndPermute( - DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) + if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v16i8, V1, V2, + Mask, Subtarget, DAG)) return V; } @@ -15027,8 +15024,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef Mask, // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) - if (SDValue V = lowerShuffleAsElementInsertion( - DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return V; if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG)) @@ -15120,8 +15117,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef Mask, if (M >= 0) M /= 2; } else { - // Otherwise just unpack the low half of V into VLoHalf and the high half into - // VHiHalf so that we can blend them as i16s. + // Otherwise just unpack the low half of V into VLoHalf and the high half + // into VHiHalf so that we can blend them as i16s. SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL); VLoHalf = DAG.getBitcast( @@ -15130,8 +15127,10 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef Mask, MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); } - SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask); - SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask); + SDValue LoV = + DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask); + SDValue HiV = + DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask); return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV); } @@ -15140,9 +15139,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef Mask, /// /// This routine breaks down the specific type of 128-bit shuffle and /// dispatches to the lowering routines accordingly. -static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef Mask, - MVT VT, SDValue V1, SDValue V2, - const APInt &Zeroable, +static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, + SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if (VT == MVT::v8bf16) { @@ -15324,7 +15322,7 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!V2.isUndef() && "This routine must not be used to lower single-input " - "shuffles as it could then recurse on itself."); + "shuffles as it could then recurse on itself."); int Size = Mask.size(); // If this can be modeled as a broadcast of two elements followed by a blend, @@ -15663,8 +15661,8 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, // instruction bytes needed to explicitly generate the zero vector. // Blends are faster and handle all the non-lane-crossing cases. 
- if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable, - Subtarget, DAG)) + if (SDValue Blend = + lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // If either input operand is a zero vector, use VPERM2X128 because its mask @@ -15690,8 +15688,8 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, // Try to use SHUF128 if possible. if (Subtarget.hasVLX()) { if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) { - unsigned PermMask = ((WidenedMask[0] % 2) << 0) | - ((WidenedMask[1] % 2) << 1); + unsigned PermMask = + ((WidenedMask[0] % 2) << 0) | ((WidenedMask[1] % 2) << 1); return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, DAG.getTargetConstant(PermMask, DL, MVT::i8)); } @@ -15715,7 +15713,7 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?"); unsigned PermMask = 0; - PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0); + PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0); PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4); // Check the immediate mask and replace unused sources with undef. @@ -15907,9 +15905,9 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( /// adjusted to access the extracted halves of the original shuffle operands is /// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or /// lower half of each input operand is accessed. -static bool -getHalfShuffleMask(ArrayRef Mask, MutableArrayRef HalfMask, - int &HalfIdx1, int &HalfIdx2) { +static bool getHalfShuffleMask(ArrayRef Mask, + MutableArrayRef HalfMask, int &HalfIdx1, + int &HalfIdx2) { assert((Mask.size() == HalfMask.size() * 2) && "Expected input mask to be twice as long as output"); @@ -15962,7 +15960,8 @@ getHalfShuffleMask(ArrayRef Mask, MutableArrayRef HalfMask, static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef HalfMask, int HalfIdx1, int HalfIdx2, bool UndefLower, - SelectionDAG &DAG, bool UseConcat = false) { + SelectionDAG &DAG, + bool UseConcat = false) { assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?"); assert(V1.getValueType().isSimple() && "Expecting only simple types"); @@ -16324,7 +16323,7 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) && "Illegal shuffle mask"); - bool ZeroLane[2] = { true, true }; + bool ZeroLane[2] = {true, true}; for (int i = 0; i < NumElts; ++i) ZeroLane[i & 1] &= Zeroable[i]; @@ -16409,9 +16408,9 @@ static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in // the upper bits of the result using an unpckldq. - SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, - { 0, 1, 2, 3, 16, 17, 18, 19, - 4, 5, 6, 7, 20, 21, 22, 23 }); + SDValue Unpack = DAG.getVectorShuffle( + MVT::v16i8, DL, V1, V2, + {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23}); // Insert the unpckldq into a zero vector to widen to v32i8. return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8, DAG.getConstant(0, DL, MVT::v32i8), Unpack, @@ -16648,8 +16647,8 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef Mask, return Blend; // Check for being able to broadcast a single element. 
- if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask, - Subtarget, DAG)) + if (SDValue Broadcast = + lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Try to use shift instructions if fast. @@ -16756,8 +16755,8 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef Mask, return Blend; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask, - Subtarget, DAG)) + if (SDValue Broadcast = + lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; if (!Subtarget.hasAVX2()) { @@ -16904,8 +16903,8 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef Mask, return Blend; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask, - Subtarget, DAG)) + if (SDValue Broadcast = + lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Try to use shift instructions if fast. @@ -17072,7 +17071,8 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef Mask, // Try to produce a fixed cross-128-bit lane permute followed by unpack // because that should be faster than the variable permute alternatives. - if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG)) + if (SDValue V = + lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG)) return V; // There are no generalized cross-lane shuffle operations available on i16 @@ -17091,8 +17091,8 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef Mask, // As this is a single-input shuffle, the repeated mask should be // a strictly valid v8i16 mask that we can pass through to the v8i16 // lowering to handle even the v16 case. - return lowerV8I16GeneralSingleInputShuffle( - DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG); + return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v16i16, V1, + RepeatedMask, Subtarget, DAG); } } @@ -17111,8 +17111,8 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef Mask, return Result; // Try to permute the lanes and then use a per-lane permute. - if (SDValue V = lowerShuffleAsLanePermuteAndPermute( - DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget)) + if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v16i16, V1, V2, + Mask, DAG, Subtarget)) return V; // Try to match an interleave of two v16i16s and lower them as unpck and @@ -17148,8 +17148,8 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef Mask, return ZExt; // Check for being able to broadcast a single element. 
- if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask, - Subtarget, DAG)) + if (SDValue Broadcast = + lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Broadcast; if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, @@ -17201,8 +17201,8 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef Mask, if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG)) return V; - if (SDValue V = lowerShuffleAsLanePermuteAndPermute( - DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) + if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v32i8, V1, V2, + Mask, DAG, Subtarget)) return V; return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask, @@ -17224,16 +17224,16 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef Mask, return Result; // Try to permute the lanes and then use a per-lane permute. - if (SDValue V = lowerShuffleAsLanePermuteAndPermute( - DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) + if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v32i8, V1, V2, + Mask, DAG, Subtarget)) return V; // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed // by zeroable elements in the remaining 24 elements. Turn this into two // vmovqb instructions shuffled together. if (Subtarget.hasVLX()) - if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2, - Mask, Zeroable, DAG)) + if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2, Mask, + Zeroable, DAG)) return V; // Try to match an interleave of two v32i8s and lower them as unpck and @@ -17288,7 +17288,8 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, return V; if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) return V; - return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false); + return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, + /*SimpleOnly*/ false); } MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), @@ -17537,8 +17538,7 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef Mask, // If we have a single input shuffle with different shuffle patterns in the // 128-bit lanes and don't lane cross, use variable mask VPERMILPS. - if (V2.isUndef() && - !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) { + if (V2.isUndef() && !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) { SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true); return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask); } @@ -17805,8 +17805,8 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( - DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v64i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return ZExt; // Use dedicated unpack instructions for masks that match their pattern. 
@@ -17883,7 +17883,8 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef Mask, if (Subtarget.hasVBMI()) return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG); - return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false); + return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, + /*SimpleOnly*/ false); } /// High-level routine to lower various 512-bit x86 vector shuffles. @@ -17891,13 +17892,11 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef Mask, /// This routine either breaks down the specific type of a 512-bit x86 vector /// shuffle or splits it into two 256-bit shuffles and fuses the results back /// together based on the available instructions. -static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef Mask, - MVT VT, SDValue V1, SDValue V2, - const APInt &Zeroable, +static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, + SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - assert(Subtarget.hasAVX512() && - "Cannot lower 512-bit vectors w/ basic ISA!"); + assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/ basic ISA!"); // If we have a single input to the zero element, insert that into V1 if we // can do so cheaply. @@ -17915,8 +17914,8 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef Mask, return V; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, - Subtarget, DAG)) + if (SDValue Broadcast = + lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG)) return Broadcast; if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) { @@ -17928,7 +17927,8 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef Mask, if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) return V; - return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false); + return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, + /*SimpleOnly*/ false); } if (VT == MVT::v32f16 || VT == MVT::v32bf16) { @@ -18035,14 +18035,12 @@ static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef Mask, return -1; } - // Lower vXi1 vector shuffles. // There is no a dedicated instruction on AVX-512 that shuffles the masks. // The only way to shuffle bits is to sign-extend the mask vector to SIMD // vector, shuffle and then truncate it back. -static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef Mask, - MVT VT, SDValue V1, SDValue V2, - const APInt &Zeroable, +static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, + SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && @@ -18173,8 +18171,8 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef Mask, int NumElems = VT.getVectorNumElements(); if ((Subtarget.hasBWI() && (NumElems >= 32)) || (Subtarget.hasDQI() && (NumElems < 32))) - return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT), - Shuffle, ISD::SETGT); + return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT), Shuffle, + ISD::SETGT); return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle); } @@ -18301,7 +18299,7 @@ static SDValue canonicalizeShuffleMaskWithHorizOp( unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget); - /// Top-level lowering for x86 vector shuffles. +/// Top-level lowering for x86 vector shuffles. 
/// /// This handles decomposition, canonicalization, and lowering of all x86 /// vector shuffles. Most of the specific lowering strategies are encapsulated @@ -18377,8 +18375,8 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget, // by obfuscating the operands with bitcasts. // TODO: Avoid lowering directly from this top-level function: make this // a query (canLowerAsBroadcast) and defer lowering to the type-based calls. - if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask, - Subtarget, DAG)) + if (SDValue Broadcast = + lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask, Subtarget, DAG)) return Broadcast; MVT NewEltVT = VT.isFloatingPoint() @@ -18601,8 +18599,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { // Build a mask by testing the condition against zero. MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts); SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond, - DAG.getConstant(0, dl, CondVT), - ISD::SETNE); + DAG.getConstant(0, dl, CondVT), ISD::SETNE); // Now return a new VSELECT using the mask. return DAG.getSelect(dl, VT, Mask, LHS, RHS); } @@ -18709,7 +18706,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { } if (VT == MVT::i32 || VT == MVT::i64) - return Op; + return Op; return SDValue(); } @@ -18722,7 +18719,7 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, SDLoc dl(Vec); MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); - auto* IdxC = dyn_cast(Idx); + auto *IdxC = dyn_cast(Idx); MVT EltVT = Op.getSimpleValueType(); assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) && @@ -18737,7 +18734,8 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, if (NumElts == 1) { Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl); MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements()); - return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec)); + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, + DAG.getBitcast(IntVT, Vec)); } MVT ExtEltVT = (NumElts <= 8) ? 
MVT::getIntegerVT(128 / NumElts) : MVT::i8; MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts); @@ -18795,14 +18793,13 @@ static APInt getExtractedDemandedElts(SDNode *N) { return DemandedElts; } -SDValue -X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, - SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); - auto* IdxC = dyn_cast(Idx); + auto *IdxC = dyn_cast(Idx); if (VecVT.getVectorElementType() == MVT::i1) return ExtractBitFromMaskVector(Op, DAG, Subtarget); @@ -18833,10 +18830,10 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // | | Ports pressure in cycles | | // |Uops| 1 | 2 - D |3 - D | 4 | 5 | | // --------------------------------------------------------- - // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0 - // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18] - // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1] - // Total Num Of Uops: 4 + // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], + // xmm0 |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18] |1 + // | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1] Total Num + // Of Uops: 4 return SDValue(); } @@ -18941,7 +18938,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // UNPCKHPD the element to the lowest double word, then movsd. // Note if the lower 64 bits of the result of the UNPCKHPD is then stored // to a f64mem, the whole operation is folded into a single MOVHPDmr. - int Mask[2] = { 1, -1 }; + int Mask[2] = {1, -1}; Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getVectorIdxConstant(0, dl)); @@ -18966,9 +18963,10 @@ static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, unsigned NumElts = VecVT.getVectorNumElements(); MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8; MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts); - SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, - DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec), - DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx); + SDValue ExtOp = + DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, + DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec), + DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx); return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp); } @@ -18995,9 +18993,9 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, if (EltVT == MVT::bf16) { MVT IVT = VT.changeVectorElementTypeToInteger(); - SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT, - DAG.getBitcast(IVT, N0), - DAG.getBitcast(MVT::i16, N1), N2); + SDValue Res = + DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT, DAG.getBitcast(IVT, N0), + DAG.getBitcast(MVT::i16, N1), N2); return DAG.getBitcast(VT, Res); } @@ -19258,8 +19256,9 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, } // Returns the appropriate wrapper opcode for a global reference. -unsigned X86TargetLowering::getGlobalWrapperKind( - const GlobalValue *GV, const unsigned char OpFlags) const { +unsigned +X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV, + const unsigned char OpFlags) const { // References to absolute symbols are never PC-relative. 
if (GV && GV->isAbsoluteSymbolRef()) return X86ISD::Wrapper; @@ -19283,8 +19282,8 @@ unsigned X86TargetLowering::getGlobalWrapperKind( // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only // be used to form addressing mode. These wrapped nodes will be selected // into MOV32ri. -SDValue -X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerConstantPool(SDValue Op, + SelectionDAG &DAG) const { ConstantPoolSDNode *CP = cast(Op); // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the @@ -19334,11 +19333,10 @@ SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op, return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr); } -SDValue -X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerBlockAddress(SDValue Op, + SelectionDAG &DAG) const { // Create the TargetBlockAddressAddress node. - unsigned char OpFlags = - Subtarget.classifyBlockAddressReference(); + unsigned char OpFlags = Subtarget.classifyBlockAddressReference(); const BlockAddress *BA = cast(Op)->getBlockAddress(); int64_t Offset = cast(Op)->getOffset(); SDLoc dl(Op); @@ -19443,8 +19441,8 @@ SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG, return Result; } -SDValue -X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerGlobalAddress(SDValue Op, + SelectionDAG &DAG) const { return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr); } @@ -19522,24 +19520,24 @@ static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA, } // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit -static SDValue -LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, - const EVT PtrVT) { +static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, + SelectionDAG &DAG, + const EVT PtrVT) { return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD, /*LoadGlobalBaseReg=*/true); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64 -static SDValue -LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, - const EVT PtrVT) { +static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, + SelectionDAG &DAG, + const EVT PtrVT) { return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32 -static SDValue -LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, - const EVT PtrVT) { +static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, + SelectionDAG &DAG, + const EVT PtrVT) { return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD); } @@ -19571,9 +19569,8 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, // Build x@dtpoff. unsigned char OperandFlags = X86II::MO_DTPOFF; unsigned WrapperKind = X86ISD::Wrapper; - SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, - GA->getValueType(0), - GA->getOffset(), OperandFlags); + SDValue TGA = DAG.getTargetGlobalAddress( + GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); // Add x@dtpoff with the base. 
@@ -19614,9 +19611,8 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, // emit "addl x@ntpoff,%eax" (local exec) // or "addl x@indntpoff,%eax" (initial exec) // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic) - SDValue TGA = - DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), - GA->getOffset(), OperandFlags); + SDValue TGA = DAG.getTargetGlobalAddress( + GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); if (model == TLSModel::InitialExec) { @@ -19635,8 +19631,8 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); } -SDValue -X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, + SelectionDAG &DAG) const { GlobalAddressSDNode *GA = cast(Op); @@ -19650,20 +19646,20 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { if (Subtarget.isTargetELF()) { TLSModel::Model model = DAG.getTarget().getTLSModel(GV); switch (model) { - case TLSModel::GeneralDynamic: - if (Subtarget.is64Bit()) { - if (Subtarget.isTarget64BitLP64()) - return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT); - return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT); - } - return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT); - case TLSModel::LocalDynamic: - return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(), - Subtarget.isTarget64BitLP64()); - case TLSModel::InitialExec: - case TLSModel::LocalExec: - return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(), - PositionIndependent); + case TLSModel::GeneralDynamic: + if (Subtarget.is64Bit()) { + if (Subtarget.isTarget64BitLP64()) + return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT); + return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT); + } + return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT); + case TLSModel::LocalDynamic: + return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(), + Subtarget.isTarget64BitLP64()); + case TLSModel::InitialExec: + case TLSModel::LocalExec: + return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(), + PositionIndependent); } llvm_unreachable("Unknown TLS model."); } @@ -19684,9 +19680,8 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { WrapperKind = X86ISD::WrapperRIP; } SDLoc DL(Op); - SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, - GA->getValueType(0), - GA->getOffset(), OpFlag); + SDValue Result = DAG.getTargetGlobalAddress( + GA->getGlobal(), DL, GA->getValueType(0), GA->getOffset(), OpFlag); SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result); // With PIC32, the address is actually $g + Offset. 
@@ -19700,7 +19695,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); - SDValue Args[] = { Chain, Offset }; + SDValue Args[] = {Chain, Offset}; Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL); @@ -19768,9 +19763,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo()); // Get the offset of start of .tls section - SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, - GA->getValueType(0), - GA->getOffset(), X86II::MO_SECREL); + SDValue TGA = + DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), + GA->getOffset(), X86II::MO_SECREL); SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA); // The address of the thread local variable is the add of the thread @@ -19830,8 +19825,8 @@ static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, MVT SrcVT = Src.getSimpleValueType(); MVT VT = Op.getSimpleValueType(); - if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() || - (VT != MVT::f32 && VT != MVT::f64)) + if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() || + (VT != MVT::f32 && VT != MVT::f64)) return SDValue(); // Pack the i64 into a vector, do the operation and extract. @@ -19896,22 +19891,22 @@ static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, const X86Subtarget &Subtarget) { switch (Opcode) { - case ISD::SINT_TO_FP: - // TODO: Handle wider types with AVX/AVX512. - if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32) - return false; - // CVTDQ2PS or (V)CVTDQ2PD - return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64); - - case ISD::UINT_TO_FP: - // TODO: Handle wider types and i64 elements. - if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32) - return false; - // VCVTUDQ2PS or VCVTUDQ2PD - return ToVT == MVT::v4f32 || ToVT == MVT::v4f64; + case ISD::SINT_TO_FP: + // TODO: Handle wider types with AVX/AVX512. + if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32) + return false; + // CVTDQ2PS or (V)CVTDQ2PD + return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64); - default: + case ISD::UINT_TO_FP: + // TODO: Handle wider types and i64 elements. + if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32) return false; + // VCVTUDQ2PS or VCVTUDQ2PD + return ToVT == MVT::v4f32 || ToVT == MVT::v4f64; + + default: + return false; } } @@ -20055,7 +20050,7 @@ static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, return SDValue(); SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64); - SDValue One = DAG.getConstant(1, DL, MVT::v4i64); + SDValue One = DAG.getConstant(1, DL, MVT::v4i64); SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64, DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One), DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One)); @@ -20275,7 +20270,7 @@ std::pair X86TargetLowering::BuildFILD( Chain = Result.getValue(1); } - return { Result, Chain }; + return {Result, Chain}; } /// Horizontal vector math instructions may be slower than normal math with @@ -20312,18 +20307,18 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl, LLVMContext *Context = DAG.getContext(); // Build some magic constants. 
- static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; + static const uint32_t CV0[] = {0x43300000, 0x45300000, 0, 0}; Constant *C0 = ConstantDataVector::get(*Context, CV0); auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16)); - SmallVector CV1; + SmallVector CV1; CV1.push_back( - ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(), - APInt(64, 0x4330000000000000ULL)))); + ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(), + APInt(64, 0x4330000000000000ULL)))); CV1.push_back( - ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(), - APInt(64, 0x4530000000000000ULL)))); + ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(), + APInt(64, 0x4530000000000000ULL)))); Constant *C1 = ConstantVector::get(CV1); SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16)); @@ -20344,11 +20339,10 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl, SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; - if (Subtarget.hasSSE3() && - shouldUseHorizontalOp(true, DAG, Subtarget)) { + if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) { Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); } else { - SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1}); + SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1, -1}); Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub); } Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, @@ -20374,8 +20368,7 @@ static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl, // Or the load with the bias. SDValue Or = DAG.getNode( - ISD::OR, dl, MVT::v2i64, - DAG.getBitcast(MVT::v2i64, Load), + ISD::OR, dl, MVT::v2i64, DAG.getBitcast(MVT::v2i64, Load), DAG.getBitcast(MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias))); Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, @@ -20579,8 +20572,9 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SDValue VecBitcast = DAG.getBitcast(VecI16VT, V); // Low will be bitcasted right away, so do not bother bitcasting back to its // original type. - Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, - VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8)); + Low = + DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, VecCstLowBitcast, + DAG.getTargetConstant(0xaa, DL, MVT::i8)); // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), // (uint4) 0x53000000, 0xaa); SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh); @@ -20588,7 +20582,8 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, // High will be bitcasted right away, so do not bother bitcasting back to // its original type. High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast, - VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8)); + VecCstHighBitcast, + DAG.getTargetConstant(0xaa, DL, MVT::i8)); } else { SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT); // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; @@ -20624,7 +20619,8 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh); } -static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, +static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 
1 : 0; SDValue N0 = Op.getOperand(OpNo); @@ -20835,8 +20831,7 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, DstTy = MVT::i64; } - assert(DstTy.getSimpleVT() <= MVT::i64 && - DstTy.getSimpleVT() >= MVT::i16 && + assert(DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT() >= MVT::i16 && "Unknown FP_TO_INT to lower!"); // We lower FP->int64 into FISTP64 followed by a load from a temporary @@ -20874,8 +20869,8 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool LosesInfo = false; if (TheVT == MVT::f64) // The rounding mode is irrelevant as the conversion should be exact. - Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, - &LosesInfo); + Status = Thresh.convert(APFloat::IEEEdouble(), + APFloat::rmNearestTiesToEven, &LosesInfo); else if (TheVT == MVT::f80) Status = Thresh.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven, &LosesInfo); @@ -20885,8 +20880,8 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT); - EVT ResVT = getSetCCResultType(DAG.getDataLayout(), - *DAG.getContext(), TheVT); + EVT ResVT = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), TheVT); SDValue Cmp; if (IsStrict) { Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain, @@ -20915,8 +20910,8 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, DAG.getConstantFP(0.0, DL, TheVT)); if (IsStrict) { - Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other}, - { Chain, Value, FltOfs }); + Value = DAG.getNode(ISD::STRICT_FSUB, DL, {TheVT, MVT::Other}, + {Chain, Value, FltOfs}); Chain = Value.getValue(1); } else Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs); @@ -20930,7 +20925,7 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI); SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); - SDValue Ops[] = { Chain, StackSlot }; + SDValue Ops[] = {Chain, StackSlot}; unsigned FLDSize = TheVT.getStoreSize(); assert(FLDSize <= MemSize && "Stack slot not big enough"); @@ -20943,10 +20938,9 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, // Build the FP_TO_INT*_IN_MEM MachineMemOperand *MMO = MF.getMachineMemOperand( MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize)); - SDValue Ops[] = { Chain, Value, StackSlot }; - SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL, - DAG.getVTList(MVT::Other), - Ops, DstTy, MMO); + SDValue Ops[] = {Chain, Value, StackSlot}; + SDValue FIST = DAG.getMemIntrinsicNode( + X86ISD::FP_TO_INT_IN_MEM, DL, DAG.getVTList(MVT::Other), Ops, DstTy, MMO); SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI); Chain = Res.getValue(1); @@ -21125,7 +21119,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, return In; unsigned NumElems = SrcVT.getVectorNumElements(); - if (NumElems < 2 || !isPowerOf2_32(NumElems) ) + if (NumElems < 2 || !isPowerOf2_32(NumElems)) return SDValue(); unsigned DstSizeInBits = DstVT.getSizeInBits(); @@ -21196,7 +21190,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits. 
SmallVector Mask; int Scale = 64 / OutVT.getScalarSizeInBits(); - narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask); + narrowShuffleMaskElts(Scale, {0, 2, 1, 3}, Mask); Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask); if (DstVT.is256BitVector()) @@ -21440,14 +21434,12 @@ static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL, if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) { // We need to shift to get the lsb into sign position. // Shift packed bytes not supported natively, bitcast to word - MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16); - In = DAG.getNode(ISD::SHL, DL, ExtVT, - DAG.getBitcast(ExtVT, In), + MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits() / 16); + In = DAG.getNode(ISD::SHL, DL, ExtVT, DAG.getBitcast(ExtVT, In), DAG.getConstant(ShiftInx, DL, ExtVT)); In = DAG.getBitcast(InVT, In); } - return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), - In, ISD::SETGT); + return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT); } // Use TESTD/Q, extended vector to packed dword/qword. assert((InVT.is256BitVector() || InVT.is128BitVector()) && @@ -21485,7 +21477,8 @@ static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL, // We either have 8 elements or we're allowed to use 512-bit vectors. // If we have VLX, we want to use the narrowest vector that can get the // job done so we use vXi32. - MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts); + MVT EltVT = + Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512 / NumElts); MVT ExtVT = MVT::getVectorVT(EltVT, NumElts); In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); InVT = ExtVT; @@ -21599,10 +21592,9 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { // On AVX2, v8i32 -> v8i16 becomes PSHUFB. if (Subtarget.hasInt256()) { // The PSHUFB mask: - static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13, - -1, -1, -1, -1, -1, -1, -1, -1, - 16, 17, 20, 21, 24, 25, 28, 29, - -1, -1, -1, -1, -1, -1, -1, -1 }; + static const int ShufMask1[] = { + 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, + 16, 17, 20, 21, 24, 25, 28, 29, -1, -1, -1, -1, -1, -1, -1, -1}; In = DAG.getBitcast(MVT::v32i8, In); In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1); In = DAG.getBitcast(MVT::v4i64, In); @@ -21793,8 +21785,8 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { dl, {NVT, MVT::Other}, {Chain, Src}); Chain = Res.getValue(1); } else { - Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl, - NVT, Src); + Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl, NVT, + Src); } // TODO: Need to add exception check code for strict FP. @@ -21896,8 +21888,8 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, DAG.getUNDEF(MVT::v2f32)); if (IsStrict) { - unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI - : X86ISD::STRICT_CVTTP2UI; + unsigned Opc = + IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI; return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp}); } unsigned Opc = IsSigned ? 
X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; @@ -22022,7 +22014,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain); if (IsStrict) - return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl); + return DAG.getMergeValues({Tmp.first, Tmp.second}, dl); return Tmp.first; } @@ -22085,7 +22077,7 @@ SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N, assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!"); Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI); SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); - SDValue Ops[] = { Chain, StackPtr }; + SDValue Ops[] = {Chain, StackPtr}; Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI, /*Align*/ std::nullopt, @@ -22093,7 +22085,7 @@ SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N, Chain = Src.getValue(1); } - SDValue StoreOps[] = { Chain, Src, StackPtr }; + SDValue StoreOps[] = {Chain, Src, StackPtr}; Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other), StoreOps, DstVT, MPI, /*Align*/ std::nullopt, MachineMemOperand::MOStore); @@ -22101,8 +22093,8 @@ SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N, return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI); } -SDValue -X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, + SelectionDAG &DAG) const { // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation, // but making use of X86 specifics to produce better instruction sequences. SDNode *Node = Op.getNode(); @@ -22164,12 +22156,12 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const { APFloat MinFloat(Sem); APFloat MaxFloat(Sem); - APFloat::opStatus MinStatus = MinFloat.convertFromAPInt( - MinInt, IsSigned, APFloat::rmTowardZero); - APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt( - MaxInt, IsSigned, APFloat::rmTowardZero); - bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact) - && !(MaxStatus & APFloat::opStatus::opInexact); + APFloat::opStatus MinStatus = + MinFloat.convertFromAPInt(MinInt, IsSigned, APFloat::rmTowardZero); + APFloat::opStatus MaxStatus = + MaxFloat.convertFromAPInt(MaxInt, IsSigned, APFloat::rmTowardZero); + bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact) && + !(MaxStatus & APFloat::opStatus::opInexact); SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT); SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT); @@ -22179,11 +22171,11 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const { if (AreExactFloatBounds) { if (DstVT != TmpVT) { // Clamp by MinFloat from below. If Src is NaN, propagate NaN. - SDValue MinClamped = DAG.getNode( - X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src); + SDValue MinClamped = + DAG.getNode(X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src); // Clamp by MaxFloat from above. If Src is NaN, propagate NaN. - SDValue BothClamped = DAG.getNode( - X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped); + SDValue BothClamped = + DAG.getNode(X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped); // Convert clamped value to integer. SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped); @@ -22193,11 +22185,11 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const { } // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat. 
- SDValue MinClamped = DAG.getNode( - X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode); + SDValue MinClamped = + DAG.getNode(X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode); // Clamp by MaxFloat from above. NaN cannot occur. - SDValue BothClamped = DAG.getNode( - X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode); + SDValue BothClamped = + DAG.getNode(X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode); // Convert clamped value to integer. SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped); @@ -22209,8 +22201,8 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const { // Otherwise, select zero if Src is NaN. SDValue ZeroInt = DAG.getConstant(0, dl, DstVT); - return DAG.getSelectCC( - dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO); + return DAG.getSelectCC(dl, Src, Src, ZeroInt, FpToInt, + ISD::CondCode::SETUO); } SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT); @@ -22232,13 +22224,13 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const { if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) { // If Src ULT MinFloat, select MinInt. In particular, this also selects // MinInt if Src is NaN. - Select = DAG.getSelectCC( - dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT); + Select = DAG.getSelectCC(dl, Src, MinFloatNode, MinIntNode, Select, + ISD::CondCode::SETULT); } // If Src OGT MaxFloat, select MaxInt. - Select = DAG.getSelectCC( - dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT); + Select = DAG.getSelectCC(dl, Src, MaxFloatNode, MaxIntNode, Select, + ISD::CondCode::SETOGT); // In the unsigned case we are done, because we mapped NaN to MinInt, which // is already zero. The promoted case was already handled above. @@ -22248,8 +22240,7 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const { // Otherwise, select 0 if Src is NaN. SDValue ZeroInt = DAG.getConstant(0, dl, DstVT); - return DAG.getSelectCC( - dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO); + return DAG.getSelectCC(dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO); } SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { @@ -22304,15 +22295,15 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { Entry.IsZExt = true; Args.push_back(Entry); - SDValue Callee = DAG.getExternalSymbol( - getLibcallName(RTLIB::FPEXT_F16_F32), - getPointerTy(DAG.getDataLayout())); + SDValue Callee = + DAG.getExternalSymbol(getLibcallName(RTLIB::FPEXT_F16_F32), + getPointerTy(DAG.getDataLayout())); CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee, std::move(Args)); SDValue Res; - std::tie(Res,Chain) = LowerCallTo(CLI); + std::tie(Res, Chain) = LowerCallTo(CLI); if (IsStrict) Res = DAG.getMergeValues({Res, Chain}, DL); @@ -22579,14 +22570,14 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, // TODO: Allow commuted (f)sub by negating the result of (F)HSUB? 
unsigned HOpcode; switch (Op.getOpcode()) { - // clang-format off + // clang-format off case ISD::ADD: HOpcode = X86ISD::HADD; break; case ISD::SUB: HOpcode = X86ISD::HSUB; break; case ISD::FADD: HOpcode = X86ISD::FHADD; break; case ISD::FSUB: HOpcode = X86ISD::FHSUB; break; default: llvm_unreachable("Trying to lower unsupported opcode to horizontal op"); - // clang-format on + // clang-format on } unsigned LExtIndex = LHS.getConstantOperandVal(1); unsigned RExtIndex = RHS.getConstantOperandVal(1); @@ -22644,7 +22635,7 @@ static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) { bool Ignored; APFloat Point5Pred = APFloat(0.5f); Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored); - Point5Pred.next(/*nextDown*/true); + Point5Pred.next(/*nextDown*/ true); SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT, DAG.getConstantFP(Point5Pred, dl, VT), N0); @@ -22694,16 +22685,16 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { unsigned EltBits = VT.getScalarSizeInBits(); // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... - APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) : - APInt::getSignMask(EltBits); + APInt MaskElt = + IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits); const fltSemantics &Sem = VT.getFltSemantics(); SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT); SDValue Op0 = Op.getOperand(0); bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); - unsigned LogicOp = IsFABS ? X86ISD::FAND : - IsFNABS ? X86ISD::FOR : - X86ISD::FXOR; + unsigned LogicOp = IsFABS ? X86ISD::FAND + : IsFNABS ? X86ISD::FOR + : X86ISD::FXOR; SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; if (VT.isVector() || IsF128) @@ -22806,7 +22797,8 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { } /// Helper for attempting to create a X86ISD::BT node. -static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) { +static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, + SelectionDAG &DAG) { // If Src is i8, promote it to i32 with any_extend. There is no i8 BT // instruction. Since the shift amount is in-range-or-undefined, we know // that doing a bittest on the i32 value is ok. We extend to i32 because @@ -23422,8 +23414,7 @@ static bool hasNonFlagsUse(SDValue Op) { // the node alone and emit a 'cmp' or 'test' instruction. static bool isProfitableToUseFlagOp(SDValue Op) { for (SDNode *U : Op->users()) - if (U->getOpcode() != ISD::CopyToReg && - U->getOpcode() != ISD::SETCC && + if (U->getOpcode() != ISD::CopyToReg && U->getOpcode() != ISD::SETCC && U->getOpcode() != ISD::STORE) return false; @@ -23439,14 +23430,20 @@ static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl, bool NeedCF = false; bool NeedOF = false; switch (X86CC) { - default: break; - case X86::COND_A: case X86::COND_AE: - case X86::COND_B: case X86::COND_BE: + default: + break; + case X86::COND_A: + case X86::COND_AE: + case X86::COND_B: + case X86::COND_BE: NeedCF = true; break; - case X86::COND_G: case X86::COND_GE: - case X86::COND_L: case X86::COND_LE: - case X86::COND_O: case X86::COND_NO: { + case X86::COND_G: + case X86::COND_GE: + case X86::COND_L: + case X86::COND_LE: + case X86::COND_O: + case X86::COND_NO: { // Check if we really need to set the // Overflow flag. If NoSignedWrap is present // that is not actually needed. @@ -23498,14 +23495,14 @@ static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl, // Otherwise use a regular EFLAGS-setting instruction. 
switch (ArithOp.getOpcode()) { - // clang-format off + // clang-format off default: llvm_unreachable("unexpected operator!"); case ISD::ADD: Opcode = X86ISD::ADD; break; case ISD::SUB: Opcode = X86ISD::SUB; break; case ISD::XOR: Opcode = X86ISD::XOR; break; case ISD::AND: Opcode = X86ISD::AND; break; case ISD::OR: Opcode = X86ISD::OR; break; - // clang-format on + // clang-format on } NumOperands = 2; @@ -23520,8 +23517,9 @@ static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl, case ISD::USUBO: { // /USUBO/SSUBO will become a X86ISD::SUB and we can use its Z flag. SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0), - Op->getOperand(1)).getValue(1); + return DAG + .getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0), Op->getOperand(1)) + .getValue(1); } default: break; @@ -23550,8 +23548,9 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC, EVT CmpVT = Op0.getValueType(); - assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 || - CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!"); + assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 || CmpVT == MVT::i32 || + CmpVT == MVT::i64) && + "Unexpected VT!"); // Only promote the compare up to I32 if it is a 16 bit operation // with an immediate. 16 bit immediates are to be avoided unless the target @@ -23678,9 +23677,8 @@ bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const { /// The minimum architected relative accuracy is 2^-12. We need one /// Newton-Raphson step to have a good float result (24 bits of precision). -SDValue X86TargetLowering::getSqrtEstimate(SDValue Op, - SelectionDAG &DAG, int Enabled, - int &RefinementSteps, +SDValue X86TargetLowering::getSqrtEstimate(SDValue Op, SelectionDAG &DAG, + int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const { SDLoc DL(Op); @@ -23787,9 +23785,7 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG, /// This is because we still need one division to calculate the reciprocal and /// then we need two multiplies by that reciprocal as replacements for the /// original divisions. 
-unsigned X86TargetLowering::combineRepeatedFPDivisors() const { - return 2; -} +unsigned X86TargetLowering::combineRepeatedFPDivisors() const { return 2; } SDValue X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, @@ -23797,7 +23793,7 @@ X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SmallVectorImpl &Created) const { AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); if (isIntDivCheap(N->getValueType(0), Attr)) - return SDValue(N,0); // Lower SDIV as SDIV + return SDValue(N, 0); // Lower SDIV as SDIV assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) && "Unexpected divisor!"); @@ -23866,8 +23862,8 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) && isPowerOf2_64(AndRHSVal)) { Src = AndLHS; - BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, - Src.getValueType()); + BitNo = + DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, Src.getValueType()); } } } @@ -23913,7 +23909,7 @@ static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, // 6 - NLE // 7 - ORD switch (SetCCOpcode) { - // clang-format off + // clang-format off default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETOEQ: case ISD::SETEQ: SSECC = 0; break; @@ -23935,7 +23931,7 @@ static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, case ISD::SETO: SSECC = 7; break; case ISD::SETUEQ: SSECC = 8; break; case ISD::SETONE: SSECC = 12; break; - // clang-format on + // clang-format on } if (Swap) std::swap(Op0, Op1); @@ -24220,13 +24216,13 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, Cmp1 = DAG.getNode( Opc, dl, {VT, MVT::Other}, {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)}); - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1), - Cmp1.getValue(1)); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + Cmp0.getValue(1), Cmp1.getValue(1)); } else { - Cmp0 = DAG.getNode( - Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)); - Cmp1 = DAG.getNode( - Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)); + Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1, + DAG.getTargetConstant(CC0, dl, MVT::i8)); + Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1, + DAG.getTargetConstant(CC1, dl, MVT::i8)); } Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); } else { @@ -24236,8 +24232,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)}); Chain = Cmp.getValue(1); } else - Cmp = DAG.getNode( - Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)); + Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1, + DAG.getTargetConstant(SSECC, dl, MVT::i8)); } } else { // Handle all other FP comparisons here. @@ -24249,8 +24245,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)}); Chain = Cmp.getValue(1); } else - Cmp = DAG.getNode( - Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)); + Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1, + DAG.getTargetConstant(SSECC, dl, MVT::i8)); } if (VT.getFixedSizeInBits() > @@ -24301,7 +24297,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, // Translate compare code to XOP PCOM compare mode. 
unsigned CmpMode = 0; switch (Cond) { - // clang-format off + // clang-format off default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETULT: case ISD::SETLT: CmpMode = 0x00; break; @@ -24313,7 +24309,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, case ISD::SETGE: CmpMode = 0x03; break; case ISD::SETEQ: CmpMode = 0x04; break; case ISD::SETNE: CmpMode = 0x05; break; - // clang-format on + // clang-format on } // Are we comparing unsigned or signed integers? @@ -24411,13 +24407,13 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, bool Invert = false; unsigned Opc; switch (Cond) { - // clang-format off + // clang-format off default: llvm_unreachable("Unexpected condition code"); case ISD::SETUGT: Invert = true; [[fallthrough]]; case ISD::SETULE: Opc = ISD::UMIN; break; case ISD::SETULT: Invert = true; [[fallthrough]]; case ISD::SETUGE: Opc = ISD::UMAX; break; - // clang-format on + // clang-format on } SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); @@ -24441,10 +24437,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, // operations may be required for some comparisons. unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ : X86ISD::PCMPGT; - bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT || - Cond == ISD::SETGE || Cond == ISD::SETUGE; - bool Invert = Cond == ISD::SETNE || - (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond)); + bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT || Cond == ISD::SETGE || + Cond == ISD::SETUGE; + bool Invert = + Cond == ISD::SETNE || (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond)); if (Swap) std::swap(Op0, Op1); @@ -24462,7 +24458,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, Op1 = DAG.getBitcast(MVT::v4i32, Op1); SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); - static const int MaskHi[] = { 1, 1, 3, 3 }; + static const int MaskHi[] = {1, 1, 3, 3}; SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); return DAG.getBitcast(VT, Result); @@ -24473,7 +24469,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, Op1 = DAG.getAllOnesConstant(dl, MVT::v4i32); SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); - static const int MaskHi[] = { 1, 1, 3, 3 }; + static const int MaskHi[] = {1, 1, 3, 3}; SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); return DAG.getBitcast(VT, Result); @@ -24512,8 +24508,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1); // Create masks for only the low parts/high parts of the 64 bit integers. - static const int MaskHi[] = { 1, 1, 3, 3 }; - static const int MaskLo[] = { 0, 0, 2, 2 }; + static const int MaskHi[] = {1, 1, 3, 3}; + static const int MaskLo[] = {0, 0, 2, 2}; SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi); SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo); SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); @@ -24540,7 +24536,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1); // Make sure the lower and upper halves are both all-ones. 
- static const int Mask[] = { 1, 0, 3, 2 }; + static const int Mask[] = {1, 0, 3, 2}; SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask); Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf); @@ -24555,8 +24551,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, // bits of the inputs before performing those operations. if (FlipSigns) { MVT EltVT = VT.getVectorElementType(); - SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl, - VT); + SDValue SM = + DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl, VT); Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM); Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM); } @@ -24573,8 +24569,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, // Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible. static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, - const X86Subtarget &Subtarget, - SDValue &X86CC) { + const X86Subtarget &Subtarget, SDValue &X86CC) { assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode"); // Must be a bitcast from vXi1. @@ -24721,7 +24716,8 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { Op.getOpcode() == ISD::STRICT_FSETCCS; MVT VT = Op->getSimpleValueType(0); - if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); + if (VT.isVector()) + return LowerVSETCC(Op, Subtarget, DAG); assert(VT == MVT::i8 && "SetCC type must be 8-bit integer"); SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); @@ -24816,7 +24812,8 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res; } -SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, + SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDValue Carry = Op.getOperand(2); @@ -24828,8 +24825,8 @@ SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const // Recreate the carry if needed. EVT CarryVT = Carry.getValueType(); - Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), - Carry, DAG.getAllOnesConstant(DL, CarryVT)); + Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), Carry, + DAG.getAllOnesConstant(DL, CarryVT)); SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1)); @@ -24849,7 +24846,8 @@ getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) { unsigned BaseOp = 0; SDLoc DL(Op); switch (Op.getOpcode()) { - default: llvm_unreachable("Unknown ovf instruction!"); + default: + llvm_unreachable("Unknown ovf instruction!"); case ISD::SADDO: BaseOp = X86ISD::ADD; Cond = X86::COND_O; @@ -24923,7 +24921,8 @@ static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { SDValue VOp0 = V.getOperand(0); unsigned InBits = VOp0.getValueSizeInBits(); unsigned Bits = V.getValueSizeInBits(); - return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); + return DAG.MaskedValueIsZero(VOp0, + APInt::getHighBitsSet(InBits, InBits - Bits)); } // Lower various (select (icmp CmpVal, 0), LHS, RHS) custom patterns. 
@@ -25061,7 +25060,7 @@ static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS, SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { bool AddTest = true; - SDValue Cond = Op.getOperand(0); + SDValue Cond = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue Op2 = Op.getOperand(2); SDLoc DL(Op); @@ -25212,14 +25211,13 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. unsigned CondOpcode = Cond.getOpcode(); - if (CondOpcode == X86ISD::SETCC || - CondOpcode == X86ISD::SETCC_CARRY) { + if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); bool IllegalFPCMov = false; - if (VT.isFloatingPoint() && !VT.isVector() && - !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack? + if (VT.isFloatingPoint() && !VT.isVector() && !isScalarFPTypeInSSEReg(VT) && + Subtarget.canUseCMOV()) // FPStack? IllegalFPCMov = !hasFPCMov(cast(CC)->getSExtValue()); if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || @@ -25282,14 +25280,15 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // X86 doesn't have an i8 cmov. If both operands are the result of a truncate // widen the cmov and push the truncate through. This avoids introducing a new // branch during isel and doesn't add any extensions. - if (Op.getValueType() == MVT::i8 && - Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) { + if (Op.getValueType() == MVT::i8 && Op1.getOpcode() == ISD::TRUNCATE && + Op2.getOpcode() == ISD::TRUNCATE) { SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0); if (T1.getValueType() == T2.getValueType() && // Exclude CopyFromReg to avoid partial register stalls. - T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){ - SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1, - CC, Cond); + T1.getOpcode() != ISD::CopyFromReg && + T2.getOpcode() != ISD::CopyFromReg) { + SDValue Cmov = + DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1, CC, Cond); return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); } } @@ -25305,14 +25304,14 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { !X86::mayFoldLoad(Op2, Subtarget))) { Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); - SDValue Ops[] = { Op2, Op1, CC, Cond }; + SDValue Ops[] = {Op2, Op1, CC, Cond}; SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops); return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); } // X86ISD::CMOV means set the result (which is operand 1) to the RHS if // condition is true. 
- SDValue Ops[] = { Op2, Op1, CC, Cond }; + SDValue Ops[] = {Op2, Op1, CC, Cond}; return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags()); } @@ -25372,7 +25371,7 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl, } SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const { - SDValue Cond = Op.getOperand(0); // condition + SDValue Cond = Op.getOperand(0); // condition SDValue TrueOp = Op.getOperand(1); // true_value SDValue FalseOp = Op.getOperand(2); // false_value SDLoc DL(Op); @@ -25533,6 +25532,69 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getBitcast(VT, CtSelect); } + // Handle f80 types by splitting into three 32-bit chunks + if (VT == MVT::f80) { + SDValue Chain = DAG.getEntryNode(); + + // Create temporary stack slots for input f80 values + SDValue TrueSlot = DAG.CreateStackTemporary(MVT::f80); + SDValue FalseSlot = DAG.CreateStackTemporary(MVT::f80); + + // Store f80 values to memory + SDValue StoreTrueF80 = + DAG.getStore(Chain, DL, TrueOp, TrueSlot, MachinePointerInfo()); + SDValue StoreFalseF80 = + DAG.getStore(Chain, DL, FalseOp, FalseSlot, MachinePointerInfo()); + + // Load i32 parts from memory (3 chunks for 96-bit f80 storage) + SDValue TruePart0 = + DAG.getLoad(MVT::i32, DL, StoreTrueF80, TrueSlot, MachinePointerInfo()); + SDValue TruePart1Ptr = + DAG.getMemBasePlusOffset(TrueSlot, TypeSize::getFixed(4), DL); + SDValue TruePart1 = DAG.getLoad(MVT::i32, DL, StoreTrueF80, TruePart1Ptr, + MachinePointerInfo()); + SDValue TruePart2Ptr = + DAG.getMemBasePlusOffset(TrueSlot, TypeSize::getFixed(8), DL); + SDValue TruePart2 = DAG.getLoad(MVT::i32, DL, StoreTrueF80, TruePart2Ptr, + MachinePointerInfo()); + + SDValue FalsePart0 = DAG.getLoad(MVT::i32, DL, StoreFalseF80, FalseSlot, + MachinePointerInfo()); + SDValue FalsePart1Ptr = + DAG.getMemBasePlusOffset(FalseSlot, TypeSize::getFixed(4), DL); + SDValue FalsePart1 = DAG.getLoad(MVT::i32, DL, StoreFalseF80, FalsePart1Ptr, + MachinePointerInfo()); + SDValue FalsePart2Ptr = + DAG.getMemBasePlusOffset(FalseSlot, TypeSize::getFixed(8), DL); + SDValue FalsePart2 = DAG.getLoad(MVT::i32, DL, StoreFalseF80, FalsePart2Ptr, + MachinePointerInfo()); + + // Perform CTSELECT on each 32-bit chunk + SDValue Part0Ops[] = {FalsePart0, TruePart0, CC, ProcessedCond}; + SDValue Part0Select = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Part0Ops); + SDValue Part1Ops[] = {FalsePart1, TruePart1, CC, ProcessedCond}; + SDValue Part1Select = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Part1Ops); + SDValue Part2Ops[] = {FalsePart2, TruePart2, CC, ProcessedCond}; + SDValue Part2Select = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Part2Ops); + + // Create result stack slot and store the selected parts + SDValue ResultSlot = DAG.CreateStackTemporary(MVT::f80); + SDValue StorePart0 = + DAG.getStore(Chain, DL, Part0Select, ResultSlot, MachinePointerInfo()); + SDValue ResPart1Ptr = + DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(4), DL); + SDValue StorePart1 = DAG.getStore(StorePart0, DL, Part1Select, ResPart1Ptr, + MachinePointerInfo()); + SDValue ResPart2Ptr = + DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(8), DL); + SDValue StorePart2 = DAG.getStore(StorePart1, DL, Part2Select, ResPart2Ptr, + MachinePointerInfo()); + + // Load complete f80 result from memory + return DAG.getLoad(MVT::f80, DL, StorePart2, ResultSlot, + MachinePointerInfo()); + } + // Create final CTSELECT node SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond}; 
return DAG.getNode(X86ISD::CTSELECT, DL, Op.getValueType(), Ops, @@ -25590,9 +25652,9 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, InVT = In.getSimpleValueType(); } - // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results, - // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still - // need to be handled here for 256/512-bit results. + // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit + // results, so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* + // instructions still need to be handled here for 256/512-bit results. if (Subtarget.hasInt256()) { assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension"); @@ -25601,9 +25663,8 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, // FIXME: Apparently we create inreg operations that could be regular // extends. - unsigned ExtOpc = - Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND - : ISD::ZERO_EXTEND; + unsigned ExtOpc = Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND; return DAG.getNode(ExtOpc, dl, VT, In); } @@ -25721,9 +25782,9 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In); unsigned NumElems = InVT.getVectorNumElements(); - SmallVector ShufMask(NumElems, -1); - for (unsigned i = 0; i != NumElems/2; ++i) - ShufMask[i] = i + NumElems/2; + SmallVector ShufMask(NumElems, -1); + for (unsigned i = 0; i != NumElems / 2; ++i) + ShufMask[i] = i + NumElems / 2; SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask); OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi); @@ -25885,11 +25946,10 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, // TODO: It is possible to support ZExt by zeroing the undef values during // the shuffle phase or after the shuffle. static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { + SelectionDAG &DAG) { MVT RegVT = Op.getSimpleValueType(); assert(RegVT.isVector() && "We only custom lower vector loads."); - assert(RegVT.isInteger() && - "We only custom lower integer vector loads."); + assert(RegVT.isInteger() && "We only custom lower integer vector loads."); LoadSDNode *Ld = cast(Op.getNode()); SDLoc dl(Ld); @@ -25932,8 +25992,8 @@ static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); - SDValue Cond = Op.getOperand(1); - SDValue Dest = Op.getOperand(2); + SDValue Cond = Op.getOperand(1); + SDValue Dest = Op.getOperand(2); SDLoc dl(Op); // Bail out when we don't have native compare instructions. @@ -25983,7 +26043,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (User->getOpcode() == ISD::BR) { SDValue FalseBB = User->getOperand(1); SDNode *NewBR = - DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); + DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); assert(NewBR == User); (void)NewBR; Dest = FalseBB; @@ -26054,9 +26114,8 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { // bytes in one go. Touching the stack at 4K increments is necessary to ensure // that the guard pages used by the OS virtual memory manager are allocated in // correct sequence. 
-SDValue -X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, - SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool SplitStack = MF.shouldSplitStack(); bool EmitStackProbeCall = hasStackProbeSymbol(MF); @@ -26067,7 +26126,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, // Get the inputs. SDNode *Node = Op.getNode(); SDValue Chain = Op.getOperand(0); - SDValue Size = Op.getOperand(1); + SDValue Size = Op.getOperand(1); MaybeAlign Alignment(Op.getConstantOperandVal(2)); EVT VT = Node->getValueType(0); @@ -26190,8 +26249,9 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MemOps.push_back(Store); // Store ptr to reg_save_area. - FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant( - Subtarget.isTarget64BitLP64() ? 8 : 4, DL)); + FIN = DAG.getNode( + ISD::ADD, DL, PtrVT, FIN, + DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 8 : 4, DL)); SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT); Store = DAG.getStore( Op.getOperand(0), DL, RSFIN, FIN, @@ -26201,8 +26261,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { } SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { - assert(Subtarget.is64Bit() && - "LowerVAARG only handles 64-bit va_arg!"); + assert(Subtarget.is64Bit() && "LowerVAARG only handles 64-bit va_arg!"); assert(Op.getNumOperands() == 4); MachineFunction &MF = DAG.getMachineFunction(); @@ -26226,11 +26285,11 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { // selection mechanism works only for the basic types. assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented"); if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { - ArgMode = 2; // Argument passed in XMM register. Use fp_offset. + ArgMode = 2; // Argument passed in XMM register. Use fp_offset. } else { assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ && "Unhandled argument type in LowerVAARG"); - ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. + ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. } if (ArgMode == 2) { @@ -26264,7 +26323,7 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, // where a va_list is still an i8*. assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!"); if (Subtarget.isCallingConvWin64( - DAG.getMachineFunction().getFunction().getCallingConv())) + DAG.getMachineFunction().getFunction().getCallingConv())) // Probably a Win64 va_copy. return DAG.expandVACopy(Op.getNode()); @@ -26326,15 +26385,17 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, return DAG.getConstant(0, dl, VT); } - assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) - && "Unknown target vector shift-by-constant node"); + assert( + (Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) && + "Unknown target vector shift-by-constant node"); // Fold this packed vector shift into a build vector if SrcOp is a // vector of Constants or UNDEFs. 
if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { unsigned ShiftOpc; switch (Opc) { - default: llvm_unreachable("Unknown opcode!"); + default: + llvm_unreachable("Unknown opcode!"); case X86ISD::VSHLI: ShiftOpc = ISD::SHL; break; @@ -26474,8 +26535,8 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT, Hi = DAG.getBitcast(MVT::v32i1, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); } else { - MVT BitcastVT = MVT::getVectorVT(MVT::i1, - Mask.getSimpleValueType().getSizeInBits()); + MVT BitcastVT = + MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits()); // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements // are extracted by EXTRACT_SUBVECTOR. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, @@ -26556,9 +26617,12 @@ static int getSEHRegistrationNodeSize(const Function *Fn) { // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See // WinEHStatePass for the full struct definition. switch (classifyEHPersonality(Fn->getPersonalityFn())) { - case EHPersonality::MSVC_X86SEH: return 24; - case EHPersonality::MSVC_CXX: return 16; - default: break; + case EHPersonality::MSVC_X86SEH: + return 24; + case EHPersonality::MSVC_CXX: + return 16; + default: + break; } report_fatal_error( "can only recover FP for 32-bit MSVC EH personality functions"); @@ -26648,13 +26712,13 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDLoc dl(Op); unsigned IntNo = Op.getConstantOperandVal(0); MVT VT = Op.getSimpleValueType(); - const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); + const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo); // Propagate flags from original node to transformed node(s). SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags()); if (IntrData) { - switch(IntrData->Type) { + switch (IntrData->Type) { case INTR_TYPE_1OP: { // We specify 2 possible opcodes for intrinsics with rounding modes. 
// First, we check if the intrinsic may have non-default rounding mode, @@ -26780,9 +26844,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (!isRoundModeCurDirection(Rnd)) return SDValue(); } - return getVectorMaskingNode( - DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru, - Subtarget, DAG); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), + Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_1OP_MASK_SAE: { SDValue Src = Op.getOperand(1); @@ -26823,9 +26886,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (!isRoundModeCurDirection(Rnd)) return SDValue(); } - return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, - Src2), - Mask, passThru, Subtarget, DAG); + return getScalarMaskingNode( + DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), Mask, passThru, + Subtarget, DAG); } assert(Op.getNumOperands() == (6U + HasRounding) && @@ -26839,9 +26902,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, else if (!isRoundModeCurDirection(Sae)) return SDValue(); } - return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, - Src2, RoundingMode), - Mask, passThru, Subtarget, DAG); + return getScalarMaskingNode( + DAG.getNode(Opc, dl, VT, Src1, Src2, RoundingMode), Mask, passThru, + Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK_RND: { SDValue Src1 = Op.getOperand(1); @@ -26876,8 +26939,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, else return SDValue(); - return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2), - Mask, passThru, Subtarget, DAG); + return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2), Mask, + passThru, Subtarget, DAG); } case INTR_TYPE_2OP_MASK: { SDValue Src1 = Op.getOperand(1); @@ -26913,8 +26976,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return SDValue(); } - return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2), - Mask, PassThru, Subtarget, DAG); + return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2), Mask, + PassThru, Subtarget, DAG); } case INTR_TYPE_3OP_SCALAR_MASK_SAE: { SDValue Src1 = Op.getOperand(1); @@ -26963,12 +27026,12 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // Reverse the operands to match VSELECT order. return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1); } - case VPERM_2OP : { + case VPERM_2OP: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); // Swap Src1 and Src2 in the node creation - return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1); + return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1); } case CFMA_OP_MASKZ: case CFMA_OP_MASK: { @@ -27012,8 +27075,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue Imm = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm); - SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(), - Subtarget, DAG); + SDValue FPclassMask = + getScalarMaskingNode(FPclass, Mask, SDValue(), Subtarget, DAG); // Need to fill with zeros to ensure the bitcast will produce zeroes // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. 
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1, @@ -27037,7 +27100,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (!isRoundModeCurDirection(Sae)) return SDValue(); } - //default rounding mode + // default rounding mode return DAG.getNode(IntrData->Opc0, dl, MaskVT, {Op.getOperand(1), Op.getOperand(2), CC, Mask}); } @@ -27055,12 +27118,12 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, else if (!isRoundModeCurDirection(Sae)) return SDValue(); } - //default rounding mode + // default rounding mode if (!Cmp.getNode()) Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC); - SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(), - Subtarget, DAG); + SDValue CmpMask = + getScalarMaskingNode(Cmp, Mask, SDValue(), Subtarget, DAG); // Need to fill with zeros to ensure the bitcast will produce zeroes // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1, @@ -27228,8 +27291,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode"); uint64_t Imm = Op.getConstantOperandVal(2); - SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl, - Op.getValueType()); + SDValue Control = + DAG.getTargetConstant(Imm & 0xffff, dl, Op.getValueType()); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Control); } @@ -27251,7 +27314,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(3), GenCF.getValue(1)); } SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG); - SDValue Results[] = { SetCC, Res }; + SDValue Results[] = {SetCC, Res}; return DAG.getMergeValues(Results, dl); } case CVTPD2PS_MASK: @@ -27334,7 +27397,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } switch (IntNo) { - default: return SDValue(); // Don't custom lower most intrinsics. + default: + return SDValue(); // Don't custom lower most intrinsics. // ptest and testp intrinsics. The intrinsic these come from are designed to // return an integer value, not just an instruction so lower it to the ptest @@ -27368,7 +27432,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, unsigned TestOpc = X86ISD::PTEST; X86::CondCode X86CC; switch (IntNo) { - default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); + default: + llvm_unreachable("Bad fallthrough in Intrinsic lowering."); case Intrinsic::x86_avx512_ktestc_b: case Intrinsic::x86_avx512_ktestc_w: case Intrinsic::x86_avx512_ktestc_d: @@ -27439,7 +27504,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, unsigned Opcode; X86::CondCode X86CC; switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + default: + llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::x86_sse42_pcmpistria128: Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_A; @@ -27609,7 +27675,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, unsigned NewIntrinsic; switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + default: + llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
case Intrinsic::x86_mmx_pslli_w: NewIntrinsic = Intrinsic::x86_mmx_psll_w; break; @@ -27686,16 +27753,16 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, MemIntrinsicSDNode *MemIntr = cast(Op); - SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale }; + SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale}; SDValue Res = DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); return DAG.getMergeValues({Res, Res.getValue(1)}, dl); } -static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, - SDValue Src, SDValue Mask, SDValue Base, - SDValue Index, SDValue ScaleOp, SDValue Chain, +static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, + SDValue Mask, SDValue Base, SDValue Index, + SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); @@ -27724,7 +27791,7 @@ static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, MemIntrinsicSDNode *MemIntr = cast(Op); - SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale }; + SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale}; SDValue Res = DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); @@ -27732,9 +27799,9 @@ static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, } static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, - SDValue Src, SDValue Mask, SDValue Base, - SDValue Index, SDValue ScaleOp, SDValue Chain, - const X86Subtarget &Subtarget) { + SDValue Src, SDValue Mask, SDValue Base, + SDValue Index, SDValue ScaleOp, SDValue Chain, + const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = dyn_cast(ScaleOp); // Scale must be constant. @@ -27776,8 +27843,8 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, TLI.getPointerTy(DAG.getDataLayout())); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); - MVT MaskVT = - MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); + MVT MaskVT = MVT::getVectorVT( + MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); @@ -27793,11 +27860,11 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, /// expanded intrinsics implicitly defines extra registers (i.e. not just /// EDX:EAX). static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, - SelectionDAG &DAG, - unsigned TargetOpcode, - unsigned SrcReg, - const X86Subtarget &Subtarget, - SmallVectorImpl &Results) { + SelectionDAG &DAG, + unsigned TargetOpcode, + unsigned SrcReg, + const X86Subtarget &Subtarget, + SmallVectorImpl &Results) { SDValue Chain = N->getOperand(0); SDValue Glue; @@ -27837,7 +27904,7 @@ static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, } // Use a buildpair to merge the two 32-bit values into a 64-bit one. - SDValue Ops[] = { LO, HI }; + SDValue Ops[] = {LO, HI}; SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); Results.push_back(Pair); Results.push_back(Chain); @@ -27854,9 +27921,9 @@ static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, // The processor's time-stamp counter (a 64-bit MSR) is stored into the // EDX:EAX registers. 
EDX is loaded with the high-order 32 bits of the MSR // and the EAX register is loaded with the low-order 32 bits. - SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode, - /* NoRegister */0, Subtarget, - Results); + SDValue Glue = + expandIntrinsicWChainHelper(N, DL, DAG, Opcode, + /* NoRegister */ 0, Subtarget, Results); if (Opcode != X86::RDTSCP) return; @@ -27914,24 +27981,24 @@ static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) { } /// Emit Truncating Store with signed or unsigned saturation. -static SDValue -EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, - SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, - SelectionDAG &DAG) { +static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, + SDValue Val, SDValue Ptr, EVT MemVT, + MachineMemOperand *MMO, SelectionDAG &DAG) { SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Undef = DAG.getUNDEF(Ptr.getValueType()); - SDValue Ops[] = { Chain, Val, Ptr, Undef }; + SDValue Ops[] = {Chain, Val, Ptr, Undef}; unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS; return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO); } /// Emit Masked Truncating Store with signed or unsigned saturation. static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, - const SDLoc &DL, - SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, - MachineMemOperand *MMO, SelectionDAG &DAG) { + const SDLoc &DL, SDValue Val, SDValue Ptr, + SDValue Mask, EVT MemVT, + MachineMemOperand *MMO, + SelectionDAG &DAG) { SDVTList VTs = DAG.getVTList(MVT::Other); - SDValue Ops[] = { Chain, Val, Ptr, Mask }; + SDValue Ops[] = {Chain, Val, Ptr, Mask}; unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS; return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO); } @@ -27999,9 +28066,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SDLoc dl(Op); // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0 // to the EDX and ECX parameters. 
- return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other, - Op.getOperand(0), Op.getOperand(2), - DAG.getConstant(0, dl, MVT::i32), + return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other, Op.getOperand(0), + Op.getOperand(2), DAG.getConstant(0, dl, MVT::i32), DAG.getConstant(0, dl, MVT::i32)); } case llvm::Intrinsic::asan_check_memaccess: { @@ -28032,7 +28098,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, unsigned Opcode; switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); + default: + llvm_unreachable("Impossible intrinsic"); case Intrinsic::x86_umwait: Opcode = X86ISD::UMWAIT; break; @@ -28045,9 +28112,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, break; } - SDValue Operation = - DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2), - Op->getOperand(3), Op->getOperand(4)); + SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2), + Op->getOperand(3), Op->getOperand(4)); SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG); return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, Operation.getValue(1)); @@ -28059,7 +28125,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); unsigned Opcode; switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic!"); + default: + llvm_unreachable("Impossible intrinsic!"); case Intrinsic::x86_enqcmd: Opcode = X86ISD::ENQCMD; break; @@ -28083,7 +28150,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, unsigned Opcode; switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); + default: + llvm_unreachable("Impossible intrinsic"); case Intrinsic::x86_aesenc128kl: Opcode = X86ISD::AESENC128KL; break; @@ -28121,7 +28189,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, unsigned Opcode; switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); + default: + llvm_unreachable("Impossible intrinsic"); case Intrinsic::x86_aesencwide128kl: Opcode = X86ISD::AESENCWIDE128KL; break; @@ -28215,9 +28284,9 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SDValue Src2 = Op.getOperand(4); SDValue CC = Op.getOperand(5); MachineMemOperand *MMO = cast(Op)->getMemOperand(); - SDValue Operation = DAG.getMemIntrinsicNode( - X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC}, - MVT::i32, MMO); + SDValue Operation = + DAG.getMemIntrinsicNode(X86ISD::CMPCCXADD, DL, Op->getVTList(), + {Chain, Addr, Src1, Src2, CC}, MVT::i32, MMO); return Operation; } case Intrinsic::x86_aadd32: @@ -28301,8 +28370,9 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, } SDLoc dl(Op); - switch(IntrData->Type) { - default: llvm_unreachable("Unknown Intrinsic Type"); + switch (IntrData->Type) { + default: + llvm_unreachable("Unknown Intrinsic Type"); case RDSEED: case RDRAND: { // Emit the node with the right value type. 
@@ -28323,32 +28393,32 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, } case GATHER_AVX2: { SDValue Chain = Op.getOperand(0); - SDValue Src = Op.getOperand(2); - SDValue Base = Op.getOperand(3); + SDValue Src = Op.getOperand(2); + SDValue Base = Op.getOperand(3); SDValue Index = Op.getOperand(4); - SDValue Mask = Op.getOperand(5); + SDValue Mask = Op.getOperand(5); SDValue Scale = Op.getOperand(6); return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, Subtarget); } case GATHER: { - //gather(v1, mask, index, base, scale); + // gather(v1, mask, index, base, scale); SDValue Chain = Op.getOperand(0); - SDValue Src = Op.getOperand(2); - SDValue Base = Op.getOperand(3); + SDValue Src = Op.getOperand(2); + SDValue Base = Op.getOperand(3); SDValue Index = Op.getOperand(4); - SDValue Mask = Op.getOperand(5); + SDValue Mask = Op.getOperand(5); SDValue Scale = Op.getOperand(6); - return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale, - Chain, Subtarget); + return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale, Chain, + Subtarget); } case SCATTER: { - //scatter(base, mask, index, v1, scale); + // scatter(base, mask, index, v1, scale); SDValue Chain = Op.getOperand(0); - SDValue Base = Op.getOperand(2); - SDValue Mask = Op.getOperand(3); + SDValue Base = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); SDValue Index = Op.getOperand(4); - SDValue Src = Op.getOperand(5); + SDValue Src = Op.getOperand(5); SDValue Scale = Op.getOperand(6); return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, Subtarget); @@ -28359,9 +28429,9 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, "Wrong prefetch hint in intrinsic: should be 2 or 3"); unsigned Opcode = (HintVal == 2 ? 
IntrData->Opc1 : IntrData->Opc0); SDValue Chain = Op.getOperand(0); - SDValue Mask = Op.getOperand(2); + SDValue Mask = Op.getOperand(2); SDValue Index = Op.getOperand(3); - SDValue Base = Op.getOperand(4); + SDValue Base = Op.getOperand(4); SDValue Scale = Op.getOperand(5); return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain, Subtarget); @@ -28396,8 +28466,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG); SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC); - return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), - Ret, SDValue(InTrans.getNode(), 1)); + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Ret, + SDValue(InTrans.getNode(), 1)); } case TRUNCATE_TO_MEM_VI8: case TRUNCATE_TO_MEM_VI16: @@ -28410,7 +28480,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, MemIntrinsicSDNode *MemIntr = dyn_cast(Op); assert(MemIntr && "Expected MemIntrinsicSDNode!"); - EVT MemVT = MemIntr->getMemoryVT(); + EVT MemVT = MemIntr->getMemoryVT(); uint16_t TruncationOp = IntrData->Opc0; switch (TruncationOp) { @@ -28505,7 +28575,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { Register FrameReg = RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); - SDLoc dl(Op); // FIXME probably not meaningful + SDLoc dl(Op); // FIXME probably not meaningful unsigned Depth = Op.getConstantOperandVal(0); assert(((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && @@ -28519,7 +28589,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. -Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT, +Register X86TargetLowering::getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const { const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); @@ -28576,10 +28646,10 @@ bool X86TargetLowering::needsFixedCatchObjects() const { } SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { - SDValue Chain = Op.getOperand(0); - SDValue Offset = Op.getOperand(1); - SDValue Handler = Op.getOperand(2); - SDLoc dl (Op); + SDValue Chain = Op.getOperand(0); + SDValue Offset = Op.getOperand(1); + SDValue Handler = Op.getOperand(2); + SDLoc dl(Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); @@ -28590,9 +28660,9 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT); Register StoreAddrReg = (PtrVT == MVT::i64) ? 
X86::RCX : X86::ECX; - SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame, - DAG.getIntPtrConstant(RegInfo->getSlotSize(), - dl)); + SDValue StoreAddr = + DAG.getNode(ISD::ADD, dl, PtrVT, Frame, + DAG.getIntPtrConstant(RegInfo->getSlotSize(), dl)); StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset); Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo()); Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); @@ -28615,19 +28685,20 @@ SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, (void)TII->getGlobalBaseReg(&DAG.getMachineFunction()); } return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, - DAG.getVTList(MVT::i32, MVT::Other), - Op.getOperand(0), Op.getOperand(1)); + DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), + Op.getOperand(1)); } SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); - return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, - Op.getOperand(0), Op.getOperand(1)); + return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0), + Op.getOperand(1)); } -SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, - SelectionDAG &DAG) const { +SDValue +X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, + SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other, Op.getOperand(0)); @@ -28643,7 +28714,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SDValue Trmp = Op.getOperand(1); // trampoline SDValue FPtr = Op.getOperand(2); // nested function SDValue Nest = Op.getOperand(3); // 'nest' parameter value - SDLoc dl (Op); + SDLoc dl(Op); const Value *TrmpAddr = cast(Op.getOperand(4))->getValue(); const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); @@ -28652,7 +28723,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SDValue OutChains[6]; // Large code-model. - const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. + const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7; @@ -28700,7 +28771,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } else { const Function *Func = - cast(cast(Op.getOperand(5))->getValue()); + cast(cast(Op.getOperand(5))->getValue()); CallingConv::ID CC = Func->getCallingConv(); unsigned NestReg; @@ -28722,7 +28793,8 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, unsigned Idx = 0; for (FunctionType::param_iterator I = FTy->param_begin(), - E = FTy->param_end(); I != E; ++I, ++Idx) + E = FTy->param_end(); + I != E; ++I, ++Idx) if (Attrs.hasParamAttr(Idx, Attribute::InReg)) { const DataLayout &DL = DAG.getDataLayout(); // FIXME: should only count parameters that are lowered to integers. @@ -28828,18 +28900,16 @@ SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op, Chain = CWD.getValue(1); // Mask and turn the control bits into a shift for the lookup table. 
- SDValue Shift = - DAG.getNode(ISD::SRL, DL, MVT::i16, - DAG.getNode(ISD::AND, DL, MVT::i16, - CWD, DAG.getConstant(0xc00, DL, MVT::i16)), - DAG.getConstant(9, DL, MVT::i8)); + SDValue Shift = DAG.getNode(ISD::SRL, DL, MVT::i16, + DAG.getNode(ISD::AND, DL, MVT::i16, CWD, + DAG.getConstant(0xc00, DL, MVT::i16)), + DAG.getConstant(9, DL, MVT::i8)); Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift); SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32); - SDValue RetVal = - DAG.getNode(ISD::AND, DL, MVT::i32, - DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift), - DAG.getConstant(3, DL, MVT::i32)); + SDValue RetVal = DAG.getNode(ISD::AND, DL, MVT::i32, + DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift), + DAG.getConstant(3, DL, MVT::i32)); RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT); @@ -29125,17 +29195,15 @@ static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, MVT EltVT = VT.getVectorElementType(); unsigned NumElems = VT.getVectorNumElements(); - assert((EltVT == MVT::i8 || EltVT == MVT::i16) && - "Unsupported element type"); + assert((EltVT == MVT::i8 || EltVT == MVT::i16) && "Unsupported element type"); // Split vector, it's Lo and Hi parts will be handled in next iteration. - if (NumElems > 16 || - (NumElems == 16 && !Subtarget.canExtendTo512DQ())) + if (NumElems > 16 || (NumElems == 16 && !Subtarget.canExtendTo512DQ())) return splitVectorIntUnary(Op, DAG, dl); MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems); assert((NewVT.is256BitVector() || NewVT.is512BitVector()) && - "Unsupported value type for operation"); + "Unsupported value type for operation"); // Use native supported vector instruction vplzcntd. Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0)); @@ -29807,10 +29875,10 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, SmallVector LoOps, HiOps; for (unsigned i = 0; i != NumElts; i += 16) { for (unsigned j = 0; j != 8; ++j) { - LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl, - MVT::i16)); - HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl, - MVT::i16)); + LoOps.push_back( + DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl, MVT::i16)); + HiOps.push_back( + DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl, MVT::i16)); } } @@ -29851,7 +29919,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, // Merge the two vectors back together with a shuffle. This expands into 2 // shuffles. - static const int ShufMask[] = { 0, 4, 2, 6 }; + static const int ShufMask[] = {0, 4, 2, 6}; return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); } @@ -30016,7 +30084,7 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, // // Place the odd value at an even position (basically, shift all values 1 // step to the left): - const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, + const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1}; // => SDValue Odd0 = @@ -30066,7 +30134,7 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, // Only i8 vectors should need custom lowering after this. 
assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) || - (VT == MVT::v64i8 && Subtarget.hasBWI())) && + (VT == MVT::v64i8 && Subtarget.hasBWI())) && "Unsupported vector type"); // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply, @@ -30221,7 +30289,8 @@ static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, return DAG.getMergeValues({Low, Ovf}, dl); } -SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, + SelectionDAG &DAG) const { assert(Subtarget.isTargetWin64() && "Unexpected target"); EVT VT = Op.getValueType(); assert(VT.isInteger() && VT.getSizeInBits() == 128 && @@ -30236,13 +30305,13 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons RTLIB::Libcall LC; bool isSigned; switch (Op->getOpcode()) { - // clang-format off + // clang-format off default: llvm_unreachable("Unexpected request for libcall!"); case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break; case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break; case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break; case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break; - // clang-format on + // clang-format on } SDLoc dl(Op); @@ -30381,9 +30450,9 @@ static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, // The shift amount is a variable, but it is the same for all vector lanes. // These instructions are defined together with shift-immediate. -static -bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, - unsigned Opcode) { +static bool supportedVectorShiftWithBaseAmnt(EVT VT, + const X86Subtarget &Subtarget, + unsigned Opcode) { return supportedVectorShiftWithImm(VT, Subtarget, Opcode); } @@ -30412,7 +30481,7 @@ static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, return true; bool LShift = VT.is128BitVector() || VT.is256BitVector(); - bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64; + bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64; return (Opcode == ISD::SRA) ? AShift : LShift; } @@ -32350,7 +32419,8 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { auto SSID = AI->getSyncScopeID(); // We must restrict the ordering to avoid generating loads with Release or // ReleaseAcquire orderings. - auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()); + auto Order = + AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()); // Before the load we need a fence. Here is an example lifted from // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence @@ -32419,31 +32489,28 @@ static SDValue emitLockedStackOp(SelectionDAG &DAG, if (Subtarget.is64Bit()) { SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32); - SDValue Ops[] = { - DAG.getRegister(X86::RSP, MVT::i64), // Base - DAG.getTargetConstant(1, DL, MVT::i8), // Scale - DAG.getRegister(0, MVT::i64), // Index - DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp - DAG.getRegister(0, MVT::i16), // Segment. - Zero, - Chain}; - SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32, - MVT::Other, Ops); + SDValue Ops[] = {DAG.getRegister(X86::RSP, MVT::i64), // Base + DAG.getTargetConstant(1, DL, MVT::i8), // Scale + DAG.getRegister(0, MVT::i64), // Index + DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp + DAG.getRegister(0, MVT::i16), // Segment. 
+ Zero, + Chain}; + SDNode *Res = + DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32, MVT::Other, Ops); return SDValue(Res, 1); } SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32); - SDValue Ops[] = { - DAG.getRegister(X86::ESP, MVT::i32), // Base - DAG.getTargetConstant(1, DL, MVT::i8), // Scale - DAG.getRegister(0, MVT::i32), // Index - DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp - DAG.getRegister(0, MVT::i16), // Segment. - Zero, - Chain - }; - SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32, - MVT::Other, Ops); + SDValue Ops[] = {DAG.getRegister(X86::ESP, MVT::i32), // Base + DAG.getTargetConstant(1, DL, MVT::i8), // Scale + DAG.getRegister(0, MVT::i32), // Index + DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp + DAG.getRegister(0, MVT::i16), // Segment. + Zero, + Chain}; + SDNode *Res = + DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32, MVT::Other, Ops); return SDValue(Res, 1); } @@ -32476,36 +32543,44 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SDLoc DL(Op); unsigned Reg = 0; unsigned size = 0; - switch(T.SimpleTy) { - default: llvm_unreachable("Invalid value type!"); - case MVT::i8: Reg = X86::AL; size = 1; break; - case MVT::i16: Reg = X86::AX; size = 2; break; - case MVT::i32: Reg = X86::EAX; size = 4; break; + switch (T.SimpleTy) { + default: + llvm_unreachable("Invalid value type!"); + case MVT::i8: + Reg = X86::AL; + size = 1; + break; + case MVT::i16: + Reg = X86::AX; + size = 2; + break; + case MVT::i32: + Reg = X86::EAX; + size = 4; + break; case MVT::i64: assert(Subtarget.is64Bit() && "Node not type legal!"); - Reg = X86::RAX; size = 8; + Reg = X86::RAX; + size = 8; break; } - SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, - Op.getOperand(2), SDValue()); - SDValue Ops[] = { cpIn.getValue(0), - Op.getOperand(1), - Op.getOperand(3), - DAG.getTargetConstant(size, DL, MVT::i8), - cpIn.getValue(1) }; + SDValue cpIn = + DAG.getCopyToReg(Op.getOperand(0), DL, Reg, Op.getOperand(2), SDValue()); + SDValue Ops[] = {cpIn.getValue(0), Op.getOperand(1), Op.getOperand(3), + DAG.getTargetConstant(size, DL, MVT::i8), cpIn.getValue(1)}; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); MachineMemOperand *MMO = cast(Op)->getMemOperand(); - SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, - Ops, T, MMO); + SDValue Result = + DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, Ops, T, MMO); SDValue cpOut = - DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); + DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS, MVT::i32, cpOut.getValue(2)); SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG); - return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), - cpOut, Success, EFLAGS.getValue(1)); + return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), cpOut, Success, + EFLAGS.getValue(1)); } // Create MOVMSKB, taking into account whether we need to split for AVX1. @@ -32567,7 +32642,8 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, } assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 || - SrcVT == MVT::i64) && "Unexpected VT!"); + SrcVT == MVT::i64) && + "Unexpected VT!"); assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) && @@ -32581,8 +32657,7 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, // Example: from MVT::v2i32 to MVT::v4i32. 
MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(), SrcVT.getVectorNumElements() * 2); - Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src, - DAG.getUNDEF(SrcVT)); + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src, DAG.getUNDEF(SrcVT)); } else { assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() && "Unexpected source type in LowerBITCAST"); @@ -32728,7 +32803,8 @@ static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, if (Subtarget.hasVPOPCNTDQ()) { unsigned NumElems = VT.getVectorNumElements(); assert((VT.getVectorElementType() == MVT::i8 || - VT.getVectorElementType() == MVT::i16) && "Unexpected type"); + VT.getVectorElementType() == MVT::i16) && + "Unexpected type"); if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) { MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems); Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0); @@ -33127,16 +33203,16 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain); assert(!N->hasAnyUseOfValue(0)); // NOTE: The getUNDEF is needed to give something for the unused result 0. - return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), - DAG.getUNDEF(VT), NewChain); + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), DAG.getUNDEF(VT), + NewChain); } SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget); // RAUW the chain, but don't worry about the result, as it's unused. assert(!N->hasAnyUseOfValue(0)); // NOTE: The getUNDEF is needed to give something for the unused result 0. - return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), - DAG.getUNDEF(VT), LockOp.getValue(1)); + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), DAG.getUNDEF(VT), + LockOp.getValue(1)); } static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, @@ -33236,17 +33312,17 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { // Set the carry flag. SDValue Carry = Op.getOperand(2); EVT CarryVT = Carry.getValueType(); - Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), - Carry, DAG.getAllOnesConstant(DL, CarryVT)); + Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), Carry, + DAG.getAllOnesConstant(DL, CarryVT)); bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY; - SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs, - Op.getOperand(0), Op.getOperand(1), - Carry.getValue(1)); + SDValue Sum = + DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs, Op.getOperand(0), + Op.getOperand(1), Carry.getValue(1)); bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY; - SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B, - Sum.getValue(1), DL, DAG); + SDValue SetCC = + getSETCC(IsSigned ? X86::COND_O : X86::COND_B, Sum.getValue(1), DL, DAG); if (N->getValueType(1) == MVT::i1) SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); @@ -33397,8 +33473,8 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, if (!Subtarget.hasVLX() && !VT.is512BitVector() && !Index.getSimpleValueType().is512BitVector()) { // Determine how much we need to widen by to get a 512-bit type. 
- unsigned Factor = std::min(512/VT.getSizeInBits(), - 512/IndexVT.getSizeInBits()); + unsigned Factor = + std::min(512 / VT.getSizeInBits(), 512 / IndexVT.getSizeInBits()); unsigned NumElts = VT.getVectorNumElements() * Factor; VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts); @@ -33440,7 +33516,7 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, N->isExpandingLoad()); // Emit a blend. SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru); - return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl); + return DAG.getMergeValues({Select, NewLoad.getValue(1)}, dl); } assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) && @@ -33507,7 +33583,7 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, // This operation is legal for targets with VLX, but without // VLX the vector should be widened to 512 bit - unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); + unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits(); MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec); // Mask element has to be i1. @@ -33549,8 +33625,8 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() && !IndexVT.is512BitVector()) { // Determine how much we need to widen by to get a 512-bit type. - unsigned Factor = std::min(512/VT.getSizeInBits(), - 512/IndexVT.getSizeInBits()); + unsigned Factor = + std::min(512 / VT.getSizeInBits(), 512 / IndexVT.getSizeInBits()); unsigned NumElts = VT.getVectorNumElements() * Factor; @@ -33567,8 +33643,8 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, if (PassThru.isUndef()) PassThru = getZeroVector(VT, Subtarget, DAG, dl); - SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index, - N->getScale() }; + SDValue Ops[] = {N->getChain(), PassThru, Mask, + N->getBasePtr(), Index, N->getScale()}; SDValue NewGather = DAG.getMemIntrinsicNode( X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(), N->getMemOperand()); @@ -33766,7 +33842,7 @@ SDValue X86TargetLowering::visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, /// Provide custom lowering hooks for some operations. SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { - // clang-format off + // clang-format off default: llvm_unreachable("Should not custom lower this!"); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: @@ -33923,7 +33999,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG); case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG); case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG); - // clang-format on + // clang-format on } } @@ -33936,7 +34012,7 @@ bool X86TargetLowering::isSelectSupported(SelectSupportKind Kind) const { /// Replace a node with an illegal result type with a new node built out of /// custom code. void X86TargetLowering::ReplaceNodeResults(SDNode *N, - SmallVectorImpl&Results, + SmallVectorImpl &Results, SelectionDAG &DAG) const { SDLoc dl(N); unsigned Opc = N->getOpcode(); @@ -34062,8 +34138,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE); // Widen the result with by padding with undef. 
- Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res, - DAG.getUNDEF(VT)); + Res = + DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res, DAG.getUNDEF(VT)); Results.push_back(Res); Results.push_back(Ovf); return; @@ -34080,11 +34156,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, "Unexpected type action!"); unsigned NumConcat = 128 / InVT.getSizeInBits(); - EVT InWideVT = EVT::getVectorVT(*DAG.getContext(), - InVT.getVectorElementType(), - NumConcat * InVT.getVectorNumElements()); - EVT WideVT = EVT::getVectorVT(*DAG.getContext(), - VT.getVectorElementType(), + EVT InWideVT = + EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(), + NumConcat * InVT.getVectorNumElements()); + EVT WideVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumConcat * VT.getVectorNumElements()); SmallVector Ops(NumConcat, DAG.getUNDEF(InVT)); @@ -34148,7 +34223,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } - SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG); + SDValue V = LowerWin64_i128OP(SDValue(N, 0), DAG); Results.push_back(V); return; } @@ -34226,9 +34301,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo); Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi); - SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi, - { 0, 1, 2, 3, 16, 17, 18, 19, - -1, -1, -1, -1, -1, -1, -1, -1 }); + SDValue Res = DAG.getVectorShuffle( + MVT::v16i8, dl, Lo, Hi, + {0, 1, 2, 3, 16, 17, 18, 19, -1, -1, -1, -1, -1, -1, -1, -1}); Results.push_back(Res); return; } @@ -34260,7 +34335,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue In = N->getOperand(0); EVT InVT = In.getValueType(); if (!Subtarget.hasSSE41() && VT == MVT::v4i64 && - (InVT == MVT::v4i16 || InVT == MVT::v4i8)){ + (InVT == MVT::v4i16 || InVT == MVT::v4i8)) { assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector && "Unexpected type action!"); assert(Opc == ISD::SIGN_EXTEND && "Unexpected opcode"); @@ -34276,11 +34351,11 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, // Create an unpackl and unpackh to interleave the sign bits then bitcast // to v2i64. 
- SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits, - {0, 4, 1, 5}); + SDValue Lo = + DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits, {0, 4, 1, 5}); Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo); - SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits, - {2, 6, 3, 7}); + SDValue Hi = + DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits, {2, 6, 3, 7}); Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); @@ -34467,7 +34542,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } - if (VT == MVT::v2i32) { assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) && "Strict unsigned conversion requires AVX512"); @@ -34552,9 +34626,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl); - SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT, - DAG.getConstantFP(0.0, dl, VecInVT), Src, - ZeroIdx); + SDValue Res = + DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT, + DAG.getConstantFP(0.0, dl, VecInVT), Src, ZeroIdx); SDValue Chain; if (IsStrict) { SDVTList Tys = DAG.getVTList(VecVT, MVT::Other); @@ -34641,8 +34715,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, EVT SrcVT = Src.getValueType(); if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) { if (IsStrict) { - unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P - : X86ISD::STRICT_CVTUI2P; + unsigned Opc = + IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P; SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other}, {N->getOperand(0), Src}); Results.push_back(Res); @@ -34656,7 +34730,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() && Subtarget.hasSSE41() && !Subtarget.hasAVX512()) { SDValue Zero = DAG.getConstant(0, dl, SrcVT); - SDValue One = DAG.getConstant(1, dl, SrcVT); + SDValue One = DAG.getConstant(1, dl, SrcVT); SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT, DAG.getNode(ISD::SRL, dl, SrcVT, Src, One), DAG.getNode(ISD::AND, dl, SrcVT, Src, One)); @@ -34722,9 +34796,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, if (IsStrict) { SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other}, {N->getOperand(0), Or, VBias}); - SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, - {MVT::v4f32, MVT::Other}, - {Sub.getValue(1), Sub}); + SDValue Res = + DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other}, + {Sub.getValue(1), Sub}); Results.push_back(Res); Results.push_back(Res.getValue(1)); } else { @@ -34805,8 +34879,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = N->getConstantOperandVal(1); switch (IntNo) { - default : llvm_unreachable("Do not know how to custom type " - "legalize this intrinsic operation!"); + default: + llvm_unreachable("Do not know how to custom type " + "legalize this intrinsic operation!"); case Intrinsic::x86_rdtsc: return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results); @@ -34819,7 +34894,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; case Intrinsic::x86_rdpru: expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget, - Results); + Results); return; case Intrinsic::x86_xgetbv: expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget, @@ -34876,12 +34951,12 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, - 
Regs64bit ? X86::RAX : X86::EAX, - HalfT, Result.getValue(1)); + Regs64bit ? X86::RAX : X86::EAX, HalfT, + Result.getValue(1)); SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, - Regs64bit ? X86::RDX : X86::EDX, - HalfT, cpOutL.getValue(2)); - SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; + Regs64bit ? X86::RDX : X86::EDX, HalfT, + cpOutL.getValue(2)); + SDValue OpsF[] = {cpOutL.getValue(0), cpOutH.getValue(0)}; SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS, MVT::i32, cpOutH.getValue(2)); @@ -34923,7 +34998,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, // Then extract the lower 64-bits. MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32; SDVTList Tys = DAG.getVTList(LdVT, MVT::Other); - SDValue Ops[] = { Node->getChain(), Node->getBasePtr() }; + SDValue Ops[] = {Node->getChain(), Node->getBasePtr()}; SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MVT::i64, Node->getMemOperand()); if (Subtarget.hasSSE2()) { @@ -34947,10 +35022,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, // First load this into an 80-bit X87 register. This will put the whole // integer into the significand. SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); - SDValue Ops[] = { Node->getChain(), Node->getBasePtr() }; - SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD, - dl, Tys, Ops, MVT::i64, - Node->getMemOperand()); + SDValue Ops[] = {Node->getChain(), Node->getBasePtr()}; + SDValue Result = DAG.getMemIntrinsicNode( + X86ISD::FILD, dl, Tys, Ops, MVT::i64, Node->getMemOperand()); SDValue Chain = Result.getValue(1); // Now store the X87 register to a stack temporary and convert to i64. @@ -34961,7 +35035,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, int SPFI = cast(StackPtr.getNode())->getIndex(); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); - SDValue StoreOps[] = { Chain, Result, StackPtr }; + SDValue StoreOps[] = {Chain, Result, StackPtr}; Chain = DAG.getMemIntrinsicNode( X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64, MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore); @@ -35019,8 +35093,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector && "Unexpected type action!"); EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT); - SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, - N->getOperand(0)); + SDValue Res = + DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, N->getOperand(0)); Res = DAG.getBitcast(WideVT, Res); Results.push_back(Res); return; @@ -35042,8 +35116,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue Mask = Gather->getMask(); assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, - Gather->getPassThru(), - DAG.getUNDEF(VT)); + Gather->getPassThru(), DAG.getUNDEF(VT)); if (!Subtarget.hasVLX()) { // We need to widen the mask, but the instruction will only use 2 // of its elements. So we can use undef. 
@@ -35051,8 +35124,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, DAG.getUNDEF(MVT::v2i1)); Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask); } - SDValue Ops[] = { Gather->getChain(), PassThru, Mask, - Gather->getBasePtr(), Index, Gather->getScale() }; + SDValue Ops[] = {Gather->getChain(), PassThru, Mask, + Gather->getBasePtr(), Index, Gather->getScale()}; SDValue Res = DAG.getMemIntrinsicNode( X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops, Gather->getMemoryVT(), Gather->getMemOperand()); @@ -35097,7 +35170,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } case ISD::ADDRSPACECAST: { - SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG); + SDValue V = LowerADDRSPACECAST(SDValue(N, 0), DAG); Results.push_back(V); return; } @@ -35128,470 +35201,473 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((X86ISD::NodeType)Opcode) { - case X86ISD::FIRST_NUMBER: break; -#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE; - NODE_NAME_CASE(BSF) - NODE_NAME_CASE(BSR) - NODE_NAME_CASE(FSHL) - NODE_NAME_CASE(FSHR) - NODE_NAME_CASE(FAND) - NODE_NAME_CASE(FANDN) - NODE_NAME_CASE(FOR) - NODE_NAME_CASE(FXOR) - NODE_NAME_CASE(FILD) - NODE_NAME_CASE(FIST) - NODE_NAME_CASE(FP_TO_INT_IN_MEM) - NODE_NAME_CASE(FLD) - NODE_NAME_CASE(FST) - NODE_NAME_CASE(CALL) - NODE_NAME_CASE(CALL_RVMARKER) - NODE_NAME_CASE(IMP_CALL) - NODE_NAME_CASE(BT) - NODE_NAME_CASE(CMP) - NODE_NAME_CASE(FCMP) - NODE_NAME_CASE(STRICT_FCMP) - NODE_NAME_CASE(STRICT_FCMPS) - NODE_NAME_CASE(COMI) - NODE_NAME_CASE(UCOMI) - NODE_NAME_CASE(COMX) - NODE_NAME_CASE(UCOMX) - NODE_NAME_CASE(CMPM) - NODE_NAME_CASE(CMPMM) - NODE_NAME_CASE(STRICT_CMPM) - NODE_NAME_CASE(CMPMM_SAE) - NODE_NAME_CASE(SETCC) - NODE_NAME_CASE(CTSELECT) - NODE_NAME_CASE(SETCC_CARRY) - NODE_NAME_CASE(FSETCC) - NODE_NAME_CASE(FSETCCM) - NODE_NAME_CASE(FSETCCM_SAE) - NODE_NAME_CASE(CMOV) - NODE_NAME_CASE(BRCOND) - NODE_NAME_CASE(RET_GLUE) - NODE_NAME_CASE(IRET) - NODE_NAME_CASE(REP_STOS) - NODE_NAME_CASE(REP_MOVS) - NODE_NAME_CASE(GlobalBaseReg) - NODE_NAME_CASE(Wrapper) - NODE_NAME_CASE(WrapperRIP) - NODE_NAME_CASE(MOVQ2DQ) - NODE_NAME_CASE(MOVDQ2Q) - NODE_NAME_CASE(MMX_MOVD2W) - NODE_NAME_CASE(MMX_MOVW2D) - NODE_NAME_CASE(PEXTRB) - NODE_NAME_CASE(PEXTRW) - NODE_NAME_CASE(INSERTPS) - NODE_NAME_CASE(PINSRB) - NODE_NAME_CASE(PINSRW) - NODE_NAME_CASE(PSHUFB) - NODE_NAME_CASE(ANDNP) - NODE_NAME_CASE(BLENDI) - NODE_NAME_CASE(BLENDV) - NODE_NAME_CASE(HADD) - NODE_NAME_CASE(HSUB) - NODE_NAME_CASE(FHADD) - NODE_NAME_CASE(FHSUB) - NODE_NAME_CASE(CONFLICT) - NODE_NAME_CASE(FMAX) - NODE_NAME_CASE(FMAXS) - NODE_NAME_CASE(FMAX_SAE) - NODE_NAME_CASE(FMAXS_SAE) - NODE_NAME_CASE(STRICT_FMAX) - NODE_NAME_CASE(FMIN) - NODE_NAME_CASE(FMINS) - NODE_NAME_CASE(FMIN_SAE) - NODE_NAME_CASE(FMINS_SAE) - NODE_NAME_CASE(STRICT_FMIN) - NODE_NAME_CASE(FMAXC) - NODE_NAME_CASE(FMINC) - NODE_NAME_CASE(FRSQRT) - NODE_NAME_CASE(FRCP) - NODE_NAME_CASE(EXTRQI) - NODE_NAME_CASE(INSERTQI) - NODE_NAME_CASE(TLSADDR) - NODE_NAME_CASE(TLSBASEADDR) - NODE_NAME_CASE(TLSCALL) - NODE_NAME_CASE(TLSDESC) - NODE_NAME_CASE(EH_SJLJ_SETJMP) - NODE_NAME_CASE(EH_SJLJ_LONGJMP) - NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH) - NODE_NAME_CASE(EH_RETURN) - NODE_NAME_CASE(TC_RETURN) - NODE_NAME_CASE(FNSTCW16m) - NODE_NAME_CASE(FLDCW16m) - NODE_NAME_CASE(FNSTENVm) - NODE_NAME_CASE(FLDENVm) - NODE_NAME_CASE(LCMPXCHG_DAG) - NODE_NAME_CASE(LCMPXCHG8_DAG) - NODE_NAME_CASE(LCMPXCHG16_DAG) - 
NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG) - NODE_NAME_CASE(LADD) - NODE_NAME_CASE(LSUB) - NODE_NAME_CASE(LOR) - NODE_NAME_CASE(LXOR) - NODE_NAME_CASE(LAND) - NODE_NAME_CASE(LBTS) - NODE_NAME_CASE(LBTC) - NODE_NAME_CASE(LBTR) - NODE_NAME_CASE(LBTS_RM) - NODE_NAME_CASE(LBTC_RM) - NODE_NAME_CASE(LBTR_RM) - NODE_NAME_CASE(AADD) - NODE_NAME_CASE(AOR) - NODE_NAME_CASE(AXOR) - NODE_NAME_CASE(AAND) - NODE_NAME_CASE(VZEXT_MOVL) - NODE_NAME_CASE(VZEXT_LOAD) - NODE_NAME_CASE(VEXTRACT_STORE) - NODE_NAME_CASE(VTRUNC) - NODE_NAME_CASE(VTRUNCS) - NODE_NAME_CASE(VTRUNCUS) - NODE_NAME_CASE(VMTRUNC) - NODE_NAME_CASE(VMTRUNCS) - NODE_NAME_CASE(VMTRUNCUS) - NODE_NAME_CASE(VTRUNCSTORES) - NODE_NAME_CASE(VTRUNCSTOREUS) - NODE_NAME_CASE(VMTRUNCSTORES) - NODE_NAME_CASE(VMTRUNCSTOREUS) - NODE_NAME_CASE(VFPEXT) - NODE_NAME_CASE(STRICT_VFPEXT) - NODE_NAME_CASE(VFPEXT_SAE) - NODE_NAME_CASE(VFPEXTS) - NODE_NAME_CASE(VFPEXTS_SAE) - NODE_NAME_CASE(VFPROUND) - NODE_NAME_CASE(VFPROUND2) - NODE_NAME_CASE(VFPROUND2_RND) - NODE_NAME_CASE(STRICT_VFPROUND) - NODE_NAME_CASE(VMFPROUND) - NODE_NAME_CASE(VFPROUND_RND) - NODE_NAME_CASE(VFPROUNDS) - NODE_NAME_CASE(VFPROUNDS_RND) - NODE_NAME_CASE(VSHLDQ) - NODE_NAME_CASE(VSRLDQ) - NODE_NAME_CASE(VSHL) - NODE_NAME_CASE(VSRL) - NODE_NAME_CASE(VSRA) - NODE_NAME_CASE(VSHLI) - NODE_NAME_CASE(VSRLI) - NODE_NAME_CASE(VSRAI) - NODE_NAME_CASE(VSHLV) - NODE_NAME_CASE(VSRLV) - NODE_NAME_CASE(VSRAV) - NODE_NAME_CASE(VROTLI) - NODE_NAME_CASE(VROTRI) - NODE_NAME_CASE(VPPERM) - NODE_NAME_CASE(CMPP) - NODE_NAME_CASE(STRICT_CMPP) - NODE_NAME_CASE(PCMPEQ) - NODE_NAME_CASE(PCMPGT) - NODE_NAME_CASE(PHMINPOS) - NODE_NAME_CASE(ADD) - NODE_NAME_CASE(SUB) - NODE_NAME_CASE(ADC) - NODE_NAME_CASE(SBB) - NODE_NAME_CASE(SMUL) - NODE_NAME_CASE(UMUL) - NODE_NAME_CASE(OR) - NODE_NAME_CASE(XOR) - NODE_NAME_CASE(AND) - NODE_NAME_CASE(BEXTR) - NODE_NAME_CASE(BEXTRI) - NODE_NAME_CASE(BZHI) - NODE_NAME_CASE(PDEP) - NODE_NAME_CASE(PEXT) - NODE_NAME_CASE(MUL_IMM) - NODE_NAME_CASE(MOVMSK) - NODE_NAME_CASE(PTEST) - NODE_NAME_CASE(TESTP) - NODE_NAME_CASE(KORTEST) - NODE_NAME_CASE(KTEST) - NODE_NAME_CASE(KADD) - NODE_NAME_CASE(KSHIFTL) - NODE_NAME_CASE(KSHIFTR) - NODE_NAME_CASE(PACKSS) - NODE_NAME_CASE(PACKUS) - NODE_NAME_CASE(PALIGNR) - NODE_NAME_CASE(VALIGN) - NODE_NAME_CASE(VSHLD) - NODE_NAME_CASE(VSHRD) - NODE_NAME_CASE(PSHUFD) - NODE_NAME_CASE(PSHUFHW) - NODE_NAME_CASE(PSHUFLW) - NODE_NAME_CASE(SHUFP) - NODE_NAME_CASE(SHUF128) - NODE_NAME_CASE(MOVLHPS) - NODE_NAME_CASE(MOVHLPS) - NODE_NAME_CASE(MOVDDUP) - NODE_NAME_CASE(MOVSHDUP) - NODE_NAME_CASE(MOVSLDUP) - NODE_NAME_CASE(MOVSD) - NODE_NAME_CASE(MOVSS) - NODE_NAME_CASE(MOVSH) - NODE_NAME_CASE(UNPCKL) - NODE_NAME_CASE(UNPCKH) - NODE_NAME_CASE(VBROADCAST) - NODE_NAME_CASE(VBROADCAST_LOAD) - NODE_NAME_CASE(VBROADCASTM) - NODE_NAME_CASE(SUBV_BROADCAST_LOAD) - NODE_NAME_CASE(VPERMILPV) - NODE_NAME_CASE(VPERMILPI) - NODE_NAME_CASE(VPERM2X128) - NODE_NAME_CASE(VPERMV) - NODE_NAME_CASE(VPERMV3) - NODE_NAME_CASE(VPERMI) - NODE_NAME_CASE(VPTERNLOG) - NODE_NAME_CASE(FP_TO_SINT_SAT) - NODE_NAME_CASE(FP_TO_UINT_SAT) - NODE_NAME_CASE(VFIXUPIMM) - NODE_NAME_CASE(VFIXUPIMM_SAE) - NODE_NAME_CASE(VFIXUPIMMS) - NODE_NAME_CASE(VFIXUPIMMS_SAE) - NODE_NAME_CASE(VRANGE) - NODE_NAME_CASE(VRANGE_SAE) - NODE_NAME_CASE(VRANGES) - NODE_NAME_CASE(VRANGES_SAE) - NODE_NAME_CASE(PMULUDQ) - NODE_NAME_CASE(PMULDQ) - NODE_NAME_CASE(PSADBW) - NODE_NAME_CASE(DBPSADBW) - NODE_NAME_CASE(VASTART_SAVE_XMM_REGS) - NODE_NAME_CASE(VAARG_64) - NODE_NAME_CASE(VAARG_X32) - NODE_NAME_CASE(DYN_ALLOCA) - 
NODE_NAME_CASE(MFENCE) - NODE_NAME_CASE(SEG_ALLOCA) - NODE_NAME_CASE(PROBED_ALLOCA) - NODE_NAME_CASE(RDRAND) - NODE_NAME_CASE(RDSEED) - NODE_NAME_CASE(RDPKRU) - NODE_NAME_CASE(WRPKRU) - NODE_NAME_CASE(VPMADDUBSW) - NODE_NAME_CASE(VPMADDWD) - NODE_NAME_CASE(VPSHA) - NODE_NAME_CASE(VPSHL) - NODE_NAME_CASE(VPCOM) - NODE_NAME_CASE(VPCOMU) - NODE_NAME_CASE(VPERMIL2) - NODE_NAME_CASE(FMSUB) - NODE_NAME_CASE(STRICT_FMSUB) - NODE_NAME_CASE(FNMADD) - NODE_NAME_CASE(STRICT_FNMADD) - NODE_NAME_CASE(FNMSUB) - NODE_NAME_CASE(STRICT_FNMSUB) - NODE_NAME_CASE(FMADDSUB) - NODE_NAME_CASE(FMSUBADD) - NODE_NAME_CASE(FMADD_RND) - NODE_NAME_CASE(FNMADD_RND) - NODE_NAME_CASE(FMSUB_RND) - NODE_NAME_CASE(FNMSUB_RND) - NODE_NAME_CASE(FMADDSUB_RND) - NODE_NAME_CASE(FMSUBADD_RND) - NODE_NAME_CASE(VFMADDC) - NODE_NAME_CASE(VFMADDC_RND) - NODE_NAME_CASE(VFCMADDC) - NODE_NAME_CASE(VFCMADDC_RND) - NODE_NAME_CASE(VFMULC) - NODE_NAME_CASE(VFMULC_RND) - NODE_NAME_CASE(VFCMULC) - NODE_NAME_CASE(VFCMULC_RND) - NODE_NAME_CASE(VFMULCSH) - NODE_NAME_CASE(VFMULCSH_RND) - NODE_NAME_CASE(VFCMULCSH) - NODE_NAME_CASE(VFCMULCSH_RND) - NODE_NAME_CASE(VFMADDCSH) - NODE_NAME_CASE(VFMADDCSH_RND) - NODE_NAME_CASE(VFCMADDCSH) - NODE_NAME_CASE(VFCMADDCSH_RND) - NODE_NAME_CASE(VPMADD52H) - NODE_NAME_CASE(VPMADD52L) - NODE_NAME_CASE(VRNDSCALE) - NODE_NAME_CASE(STRICT_VRNDSCALE) - NODE_NAME_CASE(VRNDSCALE_SAE) - NODE_NAME_CASE(VRNDSCALES) - NODE_NAME_CASE(VRNDSCALES_SAE) - NODE_NAME_CASE(VREDUCE) - NODE_NAME_CASE(VREDUCE_SAE) - NODE_NAME_CASE(VREDUCES) - NODE_NAME_CASE(VREDUCES_SAE) - NODE_NAME_CASE(VGETMANT) - NODE_NAME_CASE(VGETMANT_SAE) - NODE_NAME_CASE(VGETMANTS) - NODE_NAME_CASE(VGETMANTS_SAE) - NODE_NAME_CASE(PCMPESTR) - NODE_NAME_CASE(PCMPISTR) - NODE_NAME_CASE(XTEST) - NODE_NAME_CASE(COMPRESS) - NODE_NAME_CASE(EXPAND) - NODE_NAME_CASE(SELECTS) - NODE_NAME_CASE(ADDSUB) - NODE_NAME_CASE(RCP14) - NODE_NAME_CASE(RCP14S) - NODE_NAME_CASE(RSQRT14) - NODE_NAME_CASE(RSQRT14S) - NODE_NAME_CASE(FADD_RND) - NODE_NAME_CASE(FADDS) - NODE_NAME_CASE(FADDS_RND) - NODE_NAME_CASE(FSUB_RND) - NODE_NAME_CASE(FSUBS) - NODE_NAME_CASE(FSUBS_RND) - NODE_NAME_CASE(FMUL_RND) - NODE_NAME_CASE(FMULS) - NODE_NAME_CASE(FMULS_RND) - NODE_NAME_CASE(FDIV_RND) - NODE_NAME_CASE(FDIVS) - NODE_NAME_CASE(FDIVS_RND) - NODE_NAME_CASE(FSQRT_RND) - NODE_NAME_CASE(FSQRTS) - NODE_NAME_CASE(FSQRTS_RND) - NODE_NAME_CASE(FGETEXP) - NODE_NAME_CASE(FGETEXP_SAE) - NODE_NAME_CASE(FGETEXPS) - NODE_NAME_CASE(FGETEXPS_SAE) - NODE_NAME_CASE(SCALEF) - NODE_NAME_CASE(SCALEF_RND) - NODE_NAME_CASE(SCALEFS) - NODE_NAME_CASE(SCALEFS_RND) - NODE_NAME_CASE(MULHRS) - NODE_NAME_CASE(SINT_TO_FP_RND) - NODE_NAME_CASE(UINT_TO_FP_RND) - NODE_NAME_CASE(CVTTP2SI) - NODE_NAME_CASE(CVTTP2UI) - NODE_NAME_CASE(STRICT_CVTTP2SI) - NODE_NAME_CASE(STRICT_CVTTP2UI) - NODE_NAME_CASE(MCVTTP2SI) - NODE_NAME_CASE(MCVTTP2UI) - NODE_NAME_CASE(CVTTP2SI_SAE) - NODE_NAME_CASE(CVTTP2UI_SAE) - NODE_NAME_CASE(CVTTS2SI) - NODE_NAME_CASE(CVTTS2UI) - NODE_NAME_CASE(CVTTS2SI_SAE) - NODE_NAME_CASE(CVTTS2UI_SAE) - NODE_NAME_CASE(CVTSI2P) - NODE_NAME_CASE(CVTUI2P) - NODE_NAME_CASE(STRICT_CVTSI2P) - NODE_NAME_CASE(STRICT_CVTUI2P) - NODE_NAME_CASE(MCVTSI2P) - NODE_NAME_CASE(MCVTUI2P) - NODE_NAME_CASE(VFPCLASS) - NODE_NAME_CASE(VFPCLASSS) - NODE_NAME_CASE(MULTISHIFT) - NODE_NAME_CASE(SCALAR_SINT_TO_FP) - NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND) - NODE_NAME_CASE(SCALAR_UINT_TO_FP) - NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND) - NODE_NAME_CASE(CVTPS2PH) - NODE_NAME_CASE(STRICT_CVTPS2PH) - NODE_NAME_CASE(CVTPS2PH_SAE) - NODE_NAME_CASE(MCVTPS2PH) - 
NODE_NAME_CASE(MCVTPS2PH_SAE) - NODE_NAME_CASE(CVTPH2PS) - NODE_NAME_CASE(STRICT_CVTPH2PS) - NODE_NAME_CASE(CVTPH2PS_SAE) - NODE_NAME_CASE(CVTP2SI) - NODE_NAME_CASE(CVTP2UI) - NODE_NAME_CASE(MCVTP2SI) - NODE_NAME_CASE(MCVTP2UI) - NODE_NAME_CASE(CVTP2SI_RND) - NODE_NAME_CASE(CVTP2UI_RND) - NODE_NAME_CASE(CVTS2SI) - NODE_NAME_CASE(CVTS2UI) - NODE_NAME_CASE(CVTS2SI_RND) - NODE_NAME_CASE(CVTS2UI_RND) - NODE_NAME_CASE(CVTNEPS2BF16) - NODE_NAME_CASE(MCVTNEPS2BF16) - NODE_NAME_CASE(DPBF16PS) - NODE_NAME_CASE(DPFP16PS) - NODE_NAME_CASE(MPSADBW) - NODE_NAME_CASE(LWPINS) - NODE_NAME_CASE(MGATHER) - NODE_NAME_CASE(MSCATTER) - NODE_NAME_CASE(VPDPBUSD) - NODE_NAME_CASE(VPDPBUSDS) - NODE_NAME_CASE(VPDPWSSD) - NODE_NAME_CASE(VPDPWSSDS) - NODE_NAME_CASE(VPSHUFBITQMB) - NODE_NAME_CASE(GF2P8MULB) - NODE_NAME_CASE(GF2P8AFFINEQB) - NODE_NAME_CASE(GF2P8AFFINEINVQB) - NODE_NAME_CASE(NT_CALL) - NODE_NAME_CASE(NT_BRIND) - NODE_NAME_CASE(UMWAIT) - NODE_NAME_CASE(TPAUSE) - NODE_NAME_CASE(ENQCMD) - NODE_NAME_CASE(ENQCMDS) - NODE_NAME_CASE(VP2INTERSECT) - NODE_NAME_CASE(VPDPBSUD) - NODE_NAME_CASE(VPDPBSUDS) - NODE_NAME_CASE(VPDPBUUD) - NODE_NAME_CASE(VPDPBUUDS) - NODE_NAME_CASE(VPDPBSSD) - NODE_NAME_CASE(VPDPBSSDS) - NODE_NAME_CASE(VPDPWSUD) - NODE_NAME_CASE(VPDPWSUDS) - NODE_NAME_CASE(VPDPWUSD) - NODE_NAME_CASE(VPDPWUSDS) - NODE_NAME_CASE(VPDPWUUD) - NODE_NAME_CASE(VPDPWUUDS) - NODE_NAME_CASE(VMINMAX) - NODE_NAME_CASE(VMINMAX_SAE) - NODE_NAME_CASE(VMINMAXS) - NODE_NAME_CASE(VMINMAXS_SAE) - NODE_NAME_CASE(CVTP2IBS) - NODE_NAME_CASE(CVTP2IUBS) - NODE_NAME_CASE(CVTP2IBS_RND) - NODE_NAME_CASE(CVTP2IUBS_RND) - NODE_NAME_CASE(CVTTP2IBS) - NODE_NAME_CASE(CVTTP2IUBS) - NODE_NAME_CASE(CVTTP2IBS_SAE) - NODE_NAME_CASE(CVTTP2IUBS_SAE) - NODE_NAME_CASE(VCVT2PH2BF8) - NODE_NAME_CASE(VCVT2PH2BF8S) - NODE_NAME_CASE(VCVT2PH2HF8) - NODE_NAME_CASE(VCVT2PH2HF8S) - NODE_NAME_CASE(VCVTBIASPH2BF8) - NODE_NAME_CASE(VCVTBIASPH2BF8S) - NODE_NAME_CASE(VCVTBIASPH2HF8) - NODE_NAME_CASE(VCVTBIASPH2HF8S) - NODE_NAME_CASE(VCVTPH2BF8) - NODE_NAME_CASE(VCVTPH2BF8S) - NODE_NAME_CASE(VCVTPH2HF8) - NODE_NAME_CASE(VCVTPH2HF8S) - NODE_NAME_CASE(VMCVTBIASPH2BF8) - NODE_NAME_CASE(VMCVTBIASPH2BF8S) - NODE_NAME_CASE(VMCVTBIASPH2HF8) - NODE_NAME_CASE(VMCVTBIASPH2HF8S) - NODE_NAME_CASE(VMCVTPH2BF8) - NODE_NAME_CASE(VMCVTPH2BF8S) - NODE_NAME_CASE(VMCVTPH2HF8) - NODE_NAME_CASE(VMCVTPH2HF8S) - NODE_NAME_CASE(VCVTHF82PH) - NODE_NAME_CASE(AESENC128KL) - NODE_NAME_CASE(AESDEC128KL) - NODE_NAME_CASE(AESENC256KL) - NODE_NAME_CASE(AESDEC256KL) - NODE_NAME_CASE(AESENCWIDE128KL) - NODE_NAME_CASE(AESDECWIDE128KL) - NODE_NAME_CASE(AESENCWIDE256KL) - NODE_NAME_CASE(AESDECWIDE256KL) - NODE_NAME_CASE(CMPCCXADD) - NODE_NAME_CASE(TESTUI) - NODE_NAME_CASE(FP80_ADD) - NODE_NAME_CASE(STRICT_FP80_ADD) - NODE_NAME_CASE(CCMP) - NODE_NAME_CASE(CTEST) - NODE_NAME_CASE(CLOAD) - NODE_NAME_CASE(CSTORE) - NODE_NAME_CASE(CVTTS2SIS) - NODE_NAME_CASE(CVTTS2UIS) - NODE_NAME_CASE(CVTTS2SIS_SAE) - NODE_NAME_CASE(CVTTS2UIS_SAE) - NODE_NAME_CASE(CVTTP2SIS) - NODE_NAME_CASE(MCVTTP2SIS) - NODE_NAME_CASE(CVTTP2UIS_SAE) - NODE_NAME_CASE(CVTTP2SIS_SAE) - NODE_NAME_CASE(CVTTP2UIS) - NODE_NAME_CASE(MCVTTP2UIS) - NODE_NAME_CASE(POP_FROM_X87_REG) + case X86ISD::FIRST_NUMBER: + break; +#define NODE_NAME_CASE(NODE) \ + case X86ISD::NODE: \ + return "X86ISD::" #NODE; + NODE_NAME_CASE(BSF) + NODE_NAME_CASE(BSR) + NODE_NAME_CASE(FSHL) + NODE_NAME_CASE(FSHR) + NODE_NAME_CASE(FAND) + NODE_NAME_CASE(FANDN) + NODE_NAME_CASE(FOR) + NODE_NAME_CASE(FXOR) + NODE_NAME_CASE(FILD) + NODE_NAME_CASE(FIST) + 
NODE_NAME_CASE(FP_TO_INT_IN_MEM) + NODE_NAME_CASE(FLD) + NODE_NAME_CASE(FST) + NODE_NAME_CASE(CALL) + NODE_NAME_CASE(CALL_RVMARKER) + NODE_NAME_CASE(IMP_CALL) + NODE_NAME_CASE(BT) + NODE_NAME_CASE(CMP) + NODE_NAME_CASE(FCMP) + NODE_NAME_CASE(STRICT_FCMP) + NODE_NAME_CASE(STRICT_FCMPS) + NODE_NAME_CASE(COMI) + NODE_NAME_CASE(UCOMI) + NODE_NAME_CASE(COMX) + NODE_NAME_CASE(UCOMX) + NODE_NAME_CASE(CMPM) + NODE_NAME_CASE(CMPMM) + NODE_NAME_CASE(STRICT_CMPM) + NODE_NAME_CASE(CMPMM_SAE) + NODE_NAME_CASE(SETCC) + NODE_NAME_CASE(CTSELECT) + NODE_NAME_CASE(SETCC_CARRY) + NODE_NAME_CASE(FSETCC) + NODE_NAME_CASE(FSETCCM) + NODE_NAME_CASE(FSETCCM_SAE) + NODE_NAME_CASE(CMOV) + NODE_NAME_CASE(BRCOND) + NODE_NAME_CASE(RET_GLUE) + NODE_NAME_CASE(IRET) + NODE_NAME_CASE(REP_STOS) + NODE_NAME_CASE(REP_MOVS) + NODE_NAME_CASE(GlobalBaseReg) + NODE_NAME_CASE(Wrapper) + NODE_NAME_CASE(WrapperRIP) + NODE_NAME_CASE(MOVQ2DQ) + NODE_NAME_CASE(MOVDQ2Q) + NODE_NAME_CASE(MMX_MOVD2W) + NODE_NAME_CASE(MMX_MOVW2D) + NODE_NAME_CASE(PEXTRB) + NODE_NAME_CASE(PEXTRW) + NODE_NAME_CASE(INSERTPS) + NODE_NAME_CASE(PINSRB) + NODE_NAME_CASE(PINSRW) + NODE_NAME_CASE(PSHUFB) + NODE_NAME_CASE(ANDNP) + NODE_NAME_CASE(BLENDI) + NODE_NAME_CASE(BLENDV) + NODE_NAME_CASE(HADD) + NODE_NAME_CASE(HSUB) + NODE_NAME_CASE(FHADD) + NODE_NAME_CASE(FHSUB) + NODE_NAME_CASE(CONFLICT) + NODE_NAME_CASE(FMAX) + NODE_NAME_CASE(FMAXS) + NODE_NAME_CASE(FMAX_SAE) + NODE_NAME_CASE(FMAXS_SAE) + NODE_NAME_CASE(STRICT_FMAX) + NODE_NAME_CASE(FMIN) + NODE_NAME_CASE(FMINS) + NODE_NAME_CASE(FMIN_SAE) + NODE_NAME_CASE(FMINS_SAE) + NODE_NAME_CASE(STRICT_FMIN) + NODE_NAME_CASE(FMAXC) + NODE_NAME_CASE(FMINC) + NODE_NAME_CASE(FRSQRT) + NODE_NAME_CASE(FRCP) + NODE_NAME_CASE(EXTRQI) + NODE_NAME_CASE(INSERTQI) + NODE_NAME_CASE(TLSADDR) + NODE_NAME_CASE(TLSBASEADDR) + NODE_NAME_CASE(TLSCALL) + NODE_NAME_CASE(TLSDESC) + NODE_NAME_CASE(EH_SJLJ_SETJMP) + NODE_NAME_CASE(EH_SJLJ_LONGJMP) + NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH) + NODE_NAME_CASE(EH_RETURN) + NODE_NAME_CASE(TC_RETURN) + NODE_NAME_CASE(FNSTCW16m) + NODE_NAME_CASE(FLDCW16m) + NODE_NAME_CASE(FNSTENVm) + NODE_NAME_CASE(FLDENVm) + NODE_NAME_CASE(LCMPXCHG_DAG) + NODE_NAME_CASE(LCMPXCHG8_DAG) + NODE_NAME_CASE(LCMPXCHG16_DAG) + NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG) + NODE_NAME_CASE(LADD) + NODE_NAME_CASE(LSUB) + NODE_NAME_CASE(LOR) + NODE_NAME_CASE(LXOR) + NODE_NAME_CASE(LAND) + NODE_NAME_CASE(LBTS) + NODE_NAME_CASE(LBTC) + NODE_NAME_CASE(LBTR) + NODE_NAME_CASE(LBTS_RM) + NODE_NAME_CASE(LBTC_RM) + NODE_NAME_CASE(LBTR_RM) + NODE_NAME_CASE(AADD) + NODE_NAME_CASE(AOR) + NODE_NAME_CASE(AXOR) + NODE_NAME_CASE(AAND) + NODE_NAME_CASE(VZEXT_MOVL) + NODE_NAME_CASE(VZEXT_LOAD) + NODE_NAME_CASE(VEXTRACT_STORE) + NODE_NAME_CASE(VTRUNC) + NODE_NAME_CASE(VTRUNCS) + NODE_NAME_CASE(VTRUNCUS) + NODE_NAME_CASE(VMTRUNC) + NODE_NAME_CASE(VMTRUNCS) + NODE_NAME_CASE(VMTRUNCUS) + NODE_NAME_CASE(VTRUNCSTORES) + NODE_NAME_CASE(VTRUNCSTOREUS) + NODE_NAME_CASE(VMTRUNCSTORES) + NODE_NAME_CASE(VMTRUNCSTOREUS) + NODE_NAME_CASE(VFPEXT) + NODE_NAME_CASE(STRICT_VFPEXT) + NODE_NAME_CASE(VFPEXT_SAE) + NODE_NAME_CASE(VFPEXTS) + NODE_NAME_CASE(VFPEXTS_SAE) + NODE_NAME_CASE(VFPROUND) + NODE_NAME_CASE(VFPROUND2) + NODE_NAME_CASE(VFPROUND2_RND) + NODE_NAME_CASE(STRICT_VFPROUND) + NODE_NAME_CASE(VMFPROUND) + NODE_NAME_CASE(VFPROUND_RND) + NODE_NAME_CASE(VFPROUNDS) + NODE_NAME_CASE(VFPROUNDS_RND) + NODE_NAME_CASE(VSHLDQ) + NODE_NAME_CASE(VSRLDQ) + NODE_NAME_CASE(VSHL) + NODE_NAME_CASE(VSRL) + NODE_NAME_CASE(VSRA) + NODE_NAME_CASE(VSHLI) + NODE_NAME_CASE(VSRLI) + 
NODE_NAME_CASE(VSRAI) + NODE_NAME_CASE(VSHLV) + NODE_NAME_CASE(VSRLV) + NODE_NAME_CASE(VSRAV) + NODE_NAME_CASE(VROTLI) + NODE_NAME_CASE(VROTRI) + NODE_NAME_CASE(VPPERM) + NODE_NAME_CASE(CMPP) + NODE_NAME_CASE(STRICT_CMPP) + NODE_NAME_CASE(PCMPEQ) + NODE_NAME_CASE(PCMPGT) + NODE_NAME_CASE(PHMINPOS) + NODE_NAME_CASE(ADD) + NODE_NAME_CASE(SUB) + NODE_NAME_CASE(ADC) + NODE_NAME_CASE(SBB) + NODE_NAME_CASE(SMUL) + NODE_NAME_CASE(UMUL) + NODE_NAME_CASE(OR) + NODE_NAME_CASE(XOR) + NODE_NAME_CASE(AND) + NODE_NAME_CASE(BEXTR) + NODE_NAME_CASE(BEXTRI) + NODE_NAME_CASE(BZHI) + NODE_NAME_CASE(PDEP) + NODE_NAME_CASE(PEXT) + NODE_NAME_CASE(MUL_IMM) + NODE_NAME_CASE(MOVMSK) + NODE_NAME_CASE(PTEST) + NODE_NAME_CASE(TESTP) + NODE_NAME_CASE(KORTEST) + NODE_NAME_CASE(KTEST) + NODE_NAME_CASE(KADD) + NODE_NAME_CASE(KSHIFTL) + NODE_NAME_CASE(KSHIFTR) + NODE_NAME_CASE(PACKSS) + NODE_NAME_CASE(PACKUS) + NODE_NAME_CASE(PALIGNR) + NODE_NAME_CASE(VALIGN) + NODE_NAME_CASE(VSHLD) + NODE_NAME_CASE(VSHRD) + NODE_NAME_CASE(PSHUFD) + NODE_NAME_CASE(PSHUFHW) + NODE_NAME_CASE(PSHUFLW) + NODE_NAME_CASE(SHUFP) + NODE_NAME_CASE(SHUF128) + NODE_NAME_CASE(MOVLHPS) + NODE_NAME_CASE(MOVHLPS) + NODE_NAME_CASE(MOVDDUP) + NODE_NAME_CASE(MOVSHDUP) + NODE_NAME_CASE(MOVSLDUP) + NODE_NAME_CASE(MOVSD) + NODE_NAME_CASE(MOVSS) + NODE_NAME_CASE(MOVSH) + NODE_NAME_CASE(UNPCKL) + NODE_NAME_CASE(UNPCKH) + NODE_NAME_CASE(VBROADCAST) + NODE_NAME_CASE(VBROADCAST_LOAD) + NODE_NAME_CASE(VBROADCASTM) + NODE_NAME_CASE(SUBV_BROADCAST_LOAD) + NODE_NAME_CASE(VPERMILPV) + NODE_NAME_CASE(VPERMILPI) + NODE_NAME_CASE(VPERM2X128) + NODE_NAME_CASE(VPERMV) + NODE_NAME_CASE(VPERMV3) + NODE_NAME_CASE(VPERMI) + NODE_NAME_CASE(VPTERNLOG) + NODE_NAME_CASE(FP_TO_SINT_SAT) + NODE_NAME_CASE(FP_TO_UINT_SAT) + NODE_NAME_CASE(VFIXUPIMM) + NODE_NAME_CASE(VFIXUPIMM_SAE) + NODE_NAME_CASE(VFIXUPIMMS) + NODE_NAME_CASE(VFIXUPIMMS_SAE) + NODE_NAME_CASE(VRANGE) + NODE_NAME_CASE(VRANGE_SAE) + NODE_NAME_CASE(VRANGES) + NODE_NAME_CASE(VRANGES_SAE) + NODE_NAME_CASE(PMULUDQ) + NODE_NAME_CASE(PMULDQ) + NODE_NAME_CASE(PSADBW) + NODE_NAME_CASE(DBPSADBW) + NODE_NAME_CASE(VASTART_SAVE_XMM_REGS) + NODE_NAME_CASE(VAARG_64) + NODE_NAME_CASE(VAARG_X32) + NODE_NAME_CASE(DYN_ALLOCA) + NODE_NAME_CASE(MFENCE) + NODE_NAME_CASE(SEG_ALLOCA) + NODE_NAME_CASE(PROBED_ALLOCA) + NODE_NAME_CASE(RDRAND) + NODE_NAME_CASE(RDSEED) + NODE_NAME_CASE(RDPKRU) + NODE_NAME_CASE(WRPKRU) + NODE_NAME_CASE(VPMADDUBSW) + NODE_NAME_CASE(VPMADDWD) + NODE_NAME_CASE(VPSHA) + NODE_NAME_CASE(VPSHL) + NODE_NAME_CASE(VPCOM) + NODE_NAME_CASE(VPCOMU) + NODE_NAME_CASE(VPERMIL2) + NODE_NAME_CASE(FMSUB) + NODE_NAME_CASE(STRICT_FMSUB) + NODE_NAME_CASE(FNMADD) + NODE_NAME_CASE(STRICT_FNMADD) + NODE_NAME_CASE(FNMSUB) + NODE_NAME_CASE(STRICT_FNMSUB) + NODE_NAME_CASE(FMADDSUB) + NODE_NAME_CASE(FMSUBADD) + NODE_NAME_CASE(FMADD_RND) + NODE_NAME_CASE(FNMADD_RND) + NODE_NAME_CASE(FMSUB_RND) + NODE_NAME_CASE(FNMSUB_RND) + NODE_NAME_CASE(FMADDSUB_RND) + NODE_NAME_CASE(FMSUBADD_RND) + NODE_NAME_CASE(VFMADDC) + NODE_NAME_CASE(VFMADDC_RND) + NODE_NAME_CASE(VFCMADDC) + NODE_NAME_CASE(VFCMADDC_RND) + NODE_NAME_CASE(VFMULC) + NODE_NAME_CASE(VFMULC_RND) + NODE_NAME_CASE(VFCMULC) + NODE_NAME_CASE(VFCMULC_RND) + NODE_NAME_CASE(VFMULCSH) + NODE_NAME_CASE(VFMULCSH_RND) + NODE_NAME_CASE(VFCMULCSH) + NODE_NAME_CASE(VFCMULCSH_RND) + NODE_NAME_CASE(VFMADDCSH) + NODE_NAME_CASE(VFMADDCSH_RND) + NODE_NAME_CASE(VFCMADDCSH) + NODE_NAME_CASE(VFCMADDCSH_RND) + NODE_NAME_CASE(VPMADD52H) + NODE_NAME_CASE(VPMADD52L) + NODE_NAME_CASE(VRNDSCALE) + 
NODE_NAME_CASE(STRICT_VRNDSCALE) + NODE_NAME_CASE(VRNDSCALE_SAE) + NODE_NAME_CASE(VRNDSCALES) + NODE_NAME_CASE(VRNDSCALES_SAE) + NODE_NAME_CASE(VREDUCE) + NODE_NAME_CASE(VREDUCE_SAE) + NODE_NAME_CASE(VREDUCES) + NODE_NAME_CASE(VREDUCES_SAE) + NODE_NAME_CASE(VGETMANT) + NODE_NAME_CASE(VGETMANT_SAE) + NODE_NAME_CASE(VGETMANTS) + NODE_NAME_CASE(VGETMANTS_SAE) + NODE_NAME_CASE(PCMPESTR) + NODE_NAME_CASE(PCMPISTR) + NODE_NAME_CASE(XTEST) + NODE_NAME_CASE(COMPRESS) + NODE_NAME_CASE(EXPAND) + NODE_NAME_CASE(SELECTS) + NODE_NAME_CASE(ADDSUB) + NODE_NAME_CASE(RCP14) + NODE_NAME_CASE(RCP14S) + NODE_NAME_CASE(RSQRT14) + NODE_NAME_CASE(RSQRT14S) + NODE_NAME_CASE(FADD_RND) + NODE_NAME_CASE(FADDS) + NODE_NAME_CASE(FADDS_RND) + NODE_NAME_CASE(FSUB_RND) + NODE_NAME_CASE(FSUBS) + NODE_NAME_CASE(FSUBS_RND) + NODE_NAME_CASE(FMUL_RND) + NODE_NAME_CASE(FMULS) + NODE_NAME_CASE(FMULS_RND) + NODE_NAME_CASE(FDIV_RND) + NODE_NAME_CASE(FDIVS) + NODE_NAME_CASE(FDIVS_RND) + NODE_NAME_CASE(FSQRT_RND) + NODE_NAME_CASE(FSQRTS) + NODE_NAME_CASE(FSQRTS_RND) + NODE_NAME_CASE(FGETEXP) + NODE_NAME_CASE(FGETEXP_SAE) + NODE_NAME_CASE(FGETEXPS) + NODE_NAME_CASE(FGETEXPS_SAE) + NODE_NAME_CASE(SCALEF) + NODE_NAME_CASE(SCALEF_RND) + NODE_NAME_CASE(SCALEFS) + NODE_NAME_CASE(SCALEFS_RND) + NODE_NAME_CASE(MULHRS) + NODE_NAME_CASE(SINT_TO_FP_RND) + NODE_NAME_CASE(UINT_TO_FP_RND) + NODE_NAME_CASE(CVTTP2SI) + NODE_NAME_CASE(CVTTP2UI) + NODE_NAME_CASE(STRICT_CVTTP2SI) + NODE_NAME_CASE(STRICT_CVTTP2UI) + NODE_NAME_CASE(MCVTTP2SI) + NODE_NAME_CASE(MCVTTP2UI) + NODE_NAME_CASE(CVTTP2SI_SAE) + NODE_NAME_CASE(CVTTP2UI_SAE) + NODE_NAME_CASE(CVTTS2SI) + NODE_NAME_CASE(CVTTS2UI) + NODE_NAME_CASE(CVTTS2SI_SAE) + NODE_NAME_CASE(CVTTS2UI_SAE) + NODE_NAME_CASE(CVTSI2P) + NODE_NAME_CASE(CVTUI2P) + NODE_NAME_CASE(STRICT_CVTSI2P) + NODE_NAME_CASE(STRICT_CVTUI2P) + NODE_NAME_CASE(MCVTSI2P) + NODE_NAME_CASE(MCVTUI2P) + NODE_NAME_CASE(VFPCLASS) + NODE_NAME_CASE(VFPCLASSS) + NODE_NAME_CASE(MULTISHIFT) + NODE_NAME_CASE(SCALAR_SINT_TO_FP) + NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND) + NODE_NAME_CASE(SCALAR_UINT_TO_FP) + NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND) + NODE_NAME_CASE(CVTPS2PH) + NODE_NAME_CASE(STRICT_CVTPS2PH) + NODE_NAME_CASE(CVTPS2PH_SAE) + NODE_NAME_CASE(MCVTPS2PH) + NODE_NAME_CASE(MCVTPS2PH_SAE) + NODE_NAME_CASE(CVTPH2PS) + NODE_NAME_CASE(STRICT_CVTPH2PS) + NODE_NAME_CASE(CVTPH2PS_SAE) + NODE_NAME_CASE(CVTP2SI) + NODE_NAME_CASE(CVTP2UI) + NODE_NAME_CASE(MCVTP2SI) + NODE_NAME_CASE(MCVTP2UI) + NODE_NAME_CASE(CVTP2SI_RND) + NODE_NAME_CASE(CVTP2UI_RND) + NODE_NAME_CASE(CVTS2SI) + NODE_NAME_CASE(CVTS2UI) + NODE_NAME_CASE(CVTS2SI_RND) + NODE_NAME_CASE(CVTS2UI_RND) + NODE_NAME_CASE(CVTNEPS2BF16) + NODE_NAME_CASE(MCVTNEPS2BF16) + NODE_NAME_CASE(DPBF16PS) + NODE_NAME_CASE(DPFP16PS) + NODE_NAME_CASE(MPSADBW) + NODE_NAME_CASE(LWPINS) + NODE_NAME_CASE(MGATHER) + NODE_NAME_CASE(MSCATTER) + NODE_NAME_CASE(VPDPBUSD) + NODE_NAME_CASE(VPDPBUSDS) + NODE_NAME_CASE(VPDPWSSD) + NODE_NAME_CASE(VPDPWSSDS) + NODE_NAME_CASE(VPSHUFBITQMB) + NODE_NAME_CASE(GF2P8MULB) + NODE_NAME_CASE(GF2P8AFFINEQB) + NODE_NAME_CASE(GF2P8AFFINEINVQB) + NODE_NAME_CASE(NT_CALL) + NODE_NAME_CASE(NT_BRIND) + NODE_NAME_CASE(UMWAIT) + NODE_NAME_CASE(TPAUSE) + NODE_NAME_CASE(ENQCMD) + NODE_NAME_CASE(ENQCMDS) + NODE_NAME_CASE(VP2INTERSECT) + NODE_NAME_CASE(VPDPBSUD) + NODE_NAME_CASE(VPDPBSUDS) + NODE_NAME_CASE(VPDPBUUD) + NODE_NAME_CASE(VPDPBUUDS) + NODE_NAME_CASE(VPDPBSSD) + NODE_NAME_CASE(VPDPBSSDS) + NODE_NAME_CASE(VPDPWSUD) + NODE_NAME_CASE(VPDPWSUDS) + NODE_NAME_CASE(VPDPWUSD) + 
NODE_NAME_CASE(VPDPWUSDS) + NODE_NAME_CASE(VPDPWUUD) + NODE_NAME_CASE(VPDPWUUDS) + NODE_NAME_CASE(VMINMAX) + NODE_NAME_CASE(VMINMAX_SAE) + NODE_NAME_CASE(VMINMAXS) + NODE_NAME_CASE(VMINMAXS_SAE) + NODE_NAME_CASE(CVTP2IBS) + NODE_NAME_CASE(CVTP2IUBS) + NODE_NAME_CASE(CVTP2IBS_RND) + NODE_NAME_CASE(CVTP2IUBS_RND) + NODE_NAME_CASE(CVTTP2IBS) + NODE_NAME_CASE(CVTTP2IUBS) + NODE_NAME_CASE(CVTTP2IBS_SAE) + NODE_NAME_CASE(CVTTP2IUBS_SAE) + NODE_NAME_CASE(VCVT2PH2BF8) + NODE_NAME_CASE(VCVT2PH2BF8S) + NODE_NAME_CASE(VCVT2PH2HF8) + NODE_NAME_CASE(VCVT2PH2HF8S) + NODE_NAME_CASE(VCVTBIASPH2BF8) + NODE_NAME_CASE(VCVTBIASPH2BF8S) + NODE_NAME_CASE(VCVTBIASPH2HF8) + NODE_NAME_CASE(VCVTBIASPH2HF8S) + NODE_NAME_CASE(VCVTPH2BF8) + NODE_NAME_CASE(VCVTPH2BF8S) + NODE_NAME_CASE(VCVTPH2HF8) + NODE_NAME_CASE(VCVTPH2HF8S) + NODE_NAME_CASE(VMCVTBIASPH2BF8) + NODE_NAME_CASE(VMCVTBIASPH2BF8S) + NODE_NAME_CASE(VMCVTBIASPH2HF8) + NODE_NAME_CASE(VMCVTBIASPH2HF8S) + NODE_NAME_CASE(VMCVTPH2BF8) + NODE_NAME_CASE(VMCVTPH2BF8S) + NODE_NAME_CASE(VMCVTPH2HF8) + NODE_NAME_CASE(VMCVTPH2HF8S) + NODE_NAME_CASE(VCVTHF82PH) + NODE_NAME_CASE(AESENC128KL) + NODE_NAME_CASE(AESDEC128KL) + NODE_NAME_CASE(AESENC256KL) + NODE_NAME_CASE(AESDEC256KL) + NODE_NAME_CASE(AESENCWIDE128KL) + NODE_NAME_CASE(AESDECWIDE128KL) + NODE_NAME_CASE(AESENCWIDE256KL) + NODE_NAME_CASE(AESDECWIDE256KL) + NODE_NAME_CASE(CMPCCXADD) + NODE_NAME_CASE(TESTUI) + NODE_NAME_CASE(FP80_ADD) + NODE_NAME_CASE(STRICT_FP80_ADD) + NODE_NAME_CASE(CCMP) + NODE_NAME_CASE(CTEST) + NODE_NAME_CASE(CLOAD) + NODE_NAME_CASE(CSTORE) + NODE_NAME_CASE(CVTTS2SIS) + NODE_NAME_CASE(CVTTS2UIS) + NODE_NAME_CASE(CVTTS2SIS_SAE) + NODE_NAME_CASE(CVTTS2UIS_SAE) + NODE_NAME_CASE(CVTTP2SIS) + NODE_NAME_CASE(MCVTTP2SIS) + NODE_NAME_CASE(CVTTP2UIS_SAE) + NODE_NAME_CASE(CVTTP2SIS_SAE) + NODE_NAME_CASE(CVTTP2UIS) + NODE_NAME_CASE(MCVTTP2UIS) + NODE_NAME_CASE(POP_FROM_X87_REG) } return nullptr; #undef NODE_NAME_CASE @@ -35644,7 +35720,7 @@ bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL, if (AM.HasBaseReg) return false; break; - default: // Other stuff never works. + default: // Other stuff never works. return false; } @@ -35749,12 +35825,13 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { if (Val.getOpcode() != ISD::LOAD) return false; - if (!VT1.isSimple() || !VT1.isInteger() || - !VT2.isSimple() || !VT2.isInteger()) + if (!VT1.isSimple() || !VT1.isInteger() || !VT2.isSimple() || + !VT2.isInteger()) return false; switch (VT1.getSimpleVT().SimpleTy) { - default: break; + default: + break; case MVT::i8: case MVT::i16: case MVT::i32: @@ -35985,8 +36062,10 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, // sinkMBB: // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB) BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg) - .addReg(mainDstReg).addMBB(mainMBB) - .addReg(fallDstReg).addMBB(fallMBB); + .addReg(mainDstReg) + .addMBB(mainMBB) + .addReg(fallDstReg) + .addMBB(fallMBB); MI.eraseFromParent(); return sinkMBB; @@ -36052,8 +36131,8 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI, unsigned TotalNumXMMRegs = 8; bool UseGPOffset = (ArgMode == 1); bool UseFPOffset = (ArgMode == 2); - unsigned MaxOffset = TotalNumIntRegs * 8 + - (UseFPOffset ? TotalNumXMMRegs * 16 : 0); + unsigned MaxOffset = + TotalNumIntRegs * 8 + (UseFPOffset ? 
TotalNumXMMRegs * 16 : 0); /* Align ArgSize to a multiple of 8 */ unsigned ArgSizeA8 = (ArgSize + 7) & ~7; @@ -36131,13 +36210,14 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI, // Check if there is enough room left to pull this argument. BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri)) - .addReg(OffsetReg) - .addImm(MaxOffset + 8 - ArgSizeA8); + .addReg(OffsetReg) + .addImm(MaxOffset + 8 - ArgSizeA8); // Branch to "overflowMBB" if offset >= max // Fall through to "offsetMBB" otherwise BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1)) - .addMBB(overflowMBB).addImm(X86::COND_AE); + .addMBB(overflowMBB) + .addImm(X86::COND_AE); } // In offsetMBB, emit code to use the reg_save_area. @@ -36179,8 +36259,8 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI, // Compute the offset for the next argument Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg) - .addReg(OffsetReg) - .addImm(UseFPOffset ? 16 : 8); + .addReg(OffsetReg) + .addImm(UseFPOffset ? 16 : 8); // Store it back into the va_list. BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr)) @@ -36193,8 +36273,7 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI, .setMemRefs(StoreOnlyMMO); // Jump to endMBB - BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1)) - .addMBB(endMBB); + BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1)).addMBB(endMBB); } // @@ -36235,7 +36314,7 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI, .addImm(~(uint64_t)(Alignment.value() - 1)); } else { BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg) - .addReg(OverflowAddrReg); + .addReg(OverflowAddrReg); } // Compute the next overflow address after this argument. @@ -36261,10 +36340,11 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI, // If we branched, emit the PHI to the front of endMBB. if (offsetMBB) { - BuildMI(*endMBB, endMBB->begin(), MIMD, - TII->get(X86::PHI), DestReg) - .addReg(OffsetDestReg).addMBB(offsetMBB) - .addReg(OverflowDestReg).addMBB(overflowMBB); + BuildMI(*endMBB, endMBB->begin(), MIMD, TII->get(X86::PHI), DestReg) + .addReg(OffsetDestReg) + .addMBB(offsetMBB) + .addReg(OverflowDestReg) + .addMBB(overflowMBB); } // Erase the pseudo instruction @@ -36279,8 +36359,8 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI, // kill marker, and set it if it should. Returns the correct kill // marker value. static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, - MachineBasicBlock* BB, - const TargetRegisterInfo* TRI) { + MachineBasicBlock *BB, + const TargetRegisterInfo *TRI) { if (isPhysRegUsedAfter(X86::EFLAGS, SelectItr)) return false; @@ -36747,11 +36827,21 @@ X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI, // // + ---- <- ------------ <- ------------- <- ------------ + // | | - // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] + - // | | - // + <- ----------- <- ------------ <- ----------- <- ------------ + + // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn + // probe] -> [page alloc] -> [dyn probe] -> [tail alloc] + + // | | + // + <- + // ----------- + // <- + // ------------ + // <- + // ----------- + // <- + // ------------ + // + // - // The property we want to enforce is to never have more than [page alloc] between two probes. + // The property we want to enforce is to never have more than [page alloc] + // between two probes. 
const unsigned XORMIOpc = TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi; @@ -36843,56 +36933,61 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, // Add code to the main basic block to check if the stack limit has been hit, // and if so, jump to mallocMBB otherwise to bumpMBB. BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); - BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) - .addReg(tmpSPVReg).addReg(sizeVReg); - BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr)) - .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) - .addReg(SPLimitVReg); + BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr : X86::SUB32rr), SPLimitVReg) + .addReg(tmpSPVReg) + .addReg(sizeVReg); + BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr : X86::CMP32mr)) + .addReg(0) + .addImm(1) + .addReg(0) + .addImm(TlsOffset) + .addReg(TlsReg) + .addReg(SPLimitVReg); BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G); // bumpMBB simply decreases the stack pointer, since we know the current // stacklet has enough space. BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg) - .addReg(SPLimitVReg); + .addReg(SPLimitVReg); BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) - .addReg(SPLimitVReg); + .addReg(SPLimitVReg); BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB); // Calls into a routine in libgcc to allocate more space from the heap. const uint32_t *RegMask = Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C); if (IsLP64) { - BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI) - .addReg(sizeVReg); + BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI).addReg(sizeVReg); BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32)) - .addExternalSymbol("__morestack_allocate_stack_space") - .addRegMask(RegMask) - .addReg(X86::RDI, RegState::Implicit) - .addReg(X86::RAX, RegState::ImplicitDefine); + .addExternalSymbol("__morestack_allocate_stack_space") + .addRegMask(RegMask) + .addReg(X86::RDI, RegState::Implicit) + .addReg(X86::RAX, RegState::ImplicitDefine); } else if (Is64Bit) { - BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI) - .addReg(sizeVReg); + BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI).addReg(sizeVReg); BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32)) - .addExternalSymbol("__morestack_allocate_stack_space") - .addRegMask(RegMask) - .addReg(X86::EDI, RegState::Implicit) - .addReg(X86::EAX, RegState::ImplicitDefine); + .addExternalSymbol("__morestack_allocate_stack_space") + .addRegMask(RegMask) + .addReg(X86::EDI, RegState::Implicit) + .addReg(X86::EAX, RegState::ImplicitDefine); } else { - BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) - .addImm(12); + BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg) + .addReg(physSPReg) + .addImm(12); BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg); BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32)) - .addExternalSymbol("__morestack_allocate_stack_space") - .addRegMask(RegMask) - .addReg(X86::EAX, RegState::ImplicitDefine); + .addExternalSymbol("__morestack_allocate_stack_space") + .addRegMask(RegMask) + .addReg(X86::EAX, RegState::ImplicitDefine); } if (!Is64Bit) - BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) - .addImm(16); + BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg) + .addReg(physSPReg) + .addImm(16); BuildMI(mallocMBB, MIMD, 
TII->get(TargetOpcode::COPY), mallocPtrVReg) - .addReg(IsLP64 ? X86::RAX : X86::EAX); + .addReg(IsLP64 ? X86::RAX : X86::EAX); BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB); // Set up the CFG correctly. @@ -36947,7 +37042,8 @@ X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI, RestoreMBB->setIsEHPad(true); auto RestoreMBBI = RestoreMBB->begin(); - BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB); + BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)) + .addMBB(TargetMBB); return BB; } @@ -36969,9 +37065,10 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI, // FIXME: The 32-bit calls have non-standard calling conventions. Use a // proper register mask. const uint32_t *RegMask = - Subtarget.is64Bit() ? - Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() : - Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C); + Subtarget.is64Bit() + ? Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() + : Subtarget.getRegisterInfo()->getCallPreservedMask(*F, + CallingConv::C); if (Subtarget.is64Bit()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI) @@ -37227,8 +37324,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MemOpndSlot = CurOp; MVT PVT = getPointerTy(MF->getDataLayout()); - assert((PVT == MVT::i64 || PVT == MVT::i32) && - "Invalid Pointer Size!"); + assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); // For v = setjmp(buf), we generate // @@ -37276,19 +37372,19 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, LabelReg = MRI.createVirtualRegister(PtrRC); if (Subtarget.is64Bit()) { MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg) - .addReg(X86::RIP) - .addImm(0) - .addReg(0) - .addMBB(restoreMBB) - .addReg(0); + .addReg(X86::RIP) + .addImm(0) + .addReg(0) + .addMBB(restoreMBB) + .addReg(0); } else { - const X86InstrInfo *XII = static_cast(TII); + const X86InstrInfo *XII = static_cast(TII); MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg) - .addReg(XII->getGlobalBaseReg(MF)) - .addImm(0) - .addReg(0) - .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference()) - .addReg(0); + .addReg(XII->getGlobalBaseReg(MF)) + .addImm(0) + .addReg(0) + .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference()) + .addReg(0); } } else PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; @@ -37312,7 +37408,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, // Setup MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup)) - .addMBB(restoreMBB); + .addMBB(restoreMBB); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); MIB.addRegMask(RegInfo->getNoPreservedMask()); @@ -37339,9 +37435,9 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, Register FramePtr = RegInfo->getFrameRegister(*MF); Register BasePtr = RegInfo->getBaseRegister(); unsigned Opm = Uses64BitFramePtr ? 
X86::MOV64rm : X86::MOV32rm; - addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr), - FramePtr, true, X86FI->getRestoreBasePointerOffset()) - .setMIFlag(MachineInstr::FrameSetup); + addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr), FramePtr, + true, X86FI->getRestoreBasePointerOffset()) + .setMIFlag(MachineInstr::FrameSetup); } BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB); @@ -37424,9 +37520,9 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, if (PVT == MVT::i64) { Register TmpZReg = MRI.createVirtualRegister(PtrRC); BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg) - .addImm(0) - .addReg(ZReg) - .addImm(X86::sub_32bit); + .addImm(0) + .addReg(ZReg) + .addImm(X86::sub_32bit); ZReg = TmpZReg; } @@ -37557,11 +37653,10 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, SmallVector MMOs(MI.memoperands()); MVT PVT = getPointerTy(MF->getDataLayout()); - assert((PVT == MVT::i64 || PVT == MVT::i32) && - "Invalid Pointer Size!"); + assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); const TargetRegisterClass *RC = - (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; + (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; Register Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); @@ -37944,10 +38039,8 @@ X86TargetLowering::emitPatchableEventCall(MachineInstr &MI, /// This approach ensures that when i64 is type-legalized into two i32 /// operations, both operations share the same condition byte rather than /// each independently reading (and destroying) EFLAGS. 
-static MachineBasicBlock * -emitCTSelectI386WithConditionMaterialization(MachineInstr &MI, - MachineBasicBlock *BB, - unsigned InternalPseudoOpcode) { +static MachineBasicBlock *emitCTSelectI386WithConditionMaterialization( + MachineInstr &MI, MachineBasicBlock *BB, unsigned InternalPseudoOpcode) { const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo(); const MIMetadata MIMD(MI); MachineFunction *MF = BB->getParent(); @@ -37991,12 +38084,12 @@ emitCTSelectI386WithConditionMaterialization(MachineInstr &MI, } BuildMI(*BB, MI, MIMD, TII->get(InternalPseudoOpcode)) - .addDef(DstReg) // dst (output) - .addDef(TmpByteReg) // tmp_byte (output) - .addDef(TmpMaskReg) // tmp_mask (output) - .addReg(Src1Reg) // src1 (input) - .addReg(Src2Reg) // src2 (input) - .addReg(CondByteReg); // pre-materialized condition byte (input) + .addDef(DstReg) // dst (output) + .addDef(TmpByteReg) // tmp_byte (output) + .addDef(TmpMaskReg) // tmp_mask (output) + .addReg(Src1Reg) // src1 (input) + .addReg(Src2Reg) // src2 (input) + .addReg(CondByteReg); // pre-materialized condition byte (input) MI.eraseFromParent(); return BB; @@ -38022,8 +38115,8 @@ struct FPLoadMemOperands { // Check if a virtual register is defined by a simple FP load instruction // Returns the memory operands if it's a simple load, otherwise returns invalid static FPLoadMemOperands getFPLoadMemOperands(Register Reg, - MachineRegisterInfo &MRI, - unsigned ExpectedLoadOpcode) { + MachineRegisterInfo &MRI, + unsigned ExpectedLoadOpcode) { FPLoadMemOperands Result; if (!Reg.isVirtual()) @@ -38042,9 +38135,9 @@ static FPLoadMemOperands getFPLoadMemOperands(Register Reg, if (DefMI->hasOrderedMemoryRef()) return Result; - // The load should have a single def (the destination register) and memory operands - // Format: %reg = LD_Fpxxm , 1, %noreg, 0, %noreg - // or: %reg = LD_Fpxxm %base, scale, %index, disp, %segment + // The load should have a single def (the destination register) and memory + // operands Format: %reg = LD_Fpxxm , 1, %noreg, 0, %noreg or: %reg = + // LD_Fpxxm %base, scale, %index, disp, %segment if (DefMI->getNumOperands() < 6) return Result; @@ -38069,9 +38162,8 @@ static FPLoadMemOperands getFPLoadMemOperands(Register Reg, // Check if this is a constant pool load // Format: %reg = LD_Fpxxm $noreg, 1, $noreg, %const.N, $noreg - if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister && - ScaleMO.isImm() && IndexMO.isReg() && - IndexMO.getReg() == X86::NoRegister && + if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister && ScaleMO.isImm() && + IndexMO.isReg() && IndexMO.getReg() == X86::NoRegister && DispMO.isCPI() && SegMO.isReg()) { Result.IsValid = true; Result.IsConstantPool = true; @@ -38085,9 +38177,8 @@ static FPLoadMemOperands getFPLoadMemOperands(Register Reg, // Check if this is a global variable load // Format: %reg = LD_Fpxxm $noreg, 1, $noreg, @global_name, $noreg - if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister && - ScaleMO.isImm() && IndexMO.isReg() && - IndexMO.getReg() == X86::NoRegister && + if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister && ScaleMO.isImm() && + IndexMO.isReg() && IndexMO.getReg() == X86::NoRegister && DispMO.isGlobal() && SegMO.isReg()) { Result.IsValid = true; Result.IsGlobal = true; @@ -38101,8 +38192,8 @@ static FPLoadMemOperands getFPLoadMemOperands(Register Reg, } // Regular memory operands (e.g., pointer loads) - if (BaseMO.isReg() && ScaleMO.isImm() && IndexMO.isReg() && - DispMO.isImm() && SegMO.isReg()) { + if (BaseMO.isReg() && ScaleMO.isImm() && 
IndexMO.isReg() && DispMO.isImm() && + SegMO.isReg()) { Result.IsValid = true; Result.IsFrameIndex = false; Result.IsConstantPool = false; @@ -38128,7 +38219,8 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI, unsigned RegSizeInByte = 4; // Get operands - // MI operands: %result:rfp80 = CTSELECT_I386 %false:rfp80, %true:rfp80, %cond:i8imm + // MI operands: %result:rfp80 = CTSELECT_I386 %false:rfp80, %true:rfp80, + // %cond:i8imm unsigned DestReg = MI.getOperand(0).getReg(); unsigned FalseReg = MI.getOperand(1).getReg(); unsigned TrueReg = MI.getOperand(2).getReg(); @@ -38146,7 +38238,7 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI, // Helper to load integer from memory operands auto loadIntFromMemOperands = [&](const FPLoadMemOperands &MemOps, - unsigned Offset) -> unsigned { + unsigned Offset) -> unsigned { unsigned IntReg = MRI.createVirtualRegister(&X86::GR32RegClass); MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), IntReg); @@ -38162,18 +38254,21 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI, // Constant pool: base_reg + scale + index + CP_index + segment // MOV32rm format: base, scale, index, displacement, segment MIB.addReg(X86::NoRegister) // Base register - .addImm(MemOps.ScaleVal) // Scale - .addReg(MemOps.IndexReg) // Index register - .addConstantPoolIndex(MemOps.ConstantPoolIndex, Offset) // Displacement (CP index) - .addReg(MemOps.SegReg); // Segment + .addImm(MemOps.ScaleVal) // Scale + .addReg(MemOps.IndexReg) // Index register + .addConstantPoolIndex(MemOps.ConstantPoolIndex, + Offset) // Displacement (CP index) + .addReg(MemOps.SegReg); // Segment } else if (MemOps.IsGlobal) { // Global variable: base_reg + scale + index + global + segment // MOV32rm format: base, scale, index, displacement, segment MIB.addReg(X86::NoRegister) // Base register - .addImm(MemOps.ScaleVal) // Scale - .addReg(MemOps.IndexReg) // Index register - .addGlobalAddress(MemOps.Global, MemOps.GlobalOffset + Offset) // Displacement (global address) - .addReg(MemOps.SegReg); // Segment + .addImm(MemOps.ScaleVal) // Scale + .addReg(MemOps.IndexReg) // Index register + .addGlobalAddress(MemOps.Global, + MemOps.GlobalOffset + + Offset) // Displacement (global address) + .addReg(MemOps.SegReg); // Segment } else { // Regular memory: base_reg + scale + index + disp + segment MIB.addReg(MemOps.BaseReg) @@ -38188,45 +38283,47 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI, // Optimized path: load integers directly from memory when both operands are // memory loads, avoiding FP register round-trip - auto emitCtSelectFromMemory = [&](unsigned NumValues, - const FPLoadMemOperands &TrueMemOps, - const FPLoadMemOperands &FalseMemOps, - int ResultSlot) { - for (unsigned Val = 0; Val < NumValues; ++Val) { - unsigned Offset = Val * RegSizeInByte; - - // Load true and false values directly from their memory locations as integers - unsigned TrueIntReg = loadIntFromMemOperands(TrueMemOps, Offset); - unsigned FalseIntReg = loadIntFromMemOperands(FalseMemOps, Offset); - - // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time selection - unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass); - unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); - unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass); - - BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr)) - .addDef(ResultIntReg) // dst (output) - .addDef(TmpByteReg) // tmp_byte 
(output) - .addDef(TmpMaskReg) // tmp_mask (output) - .addReg(FalseIntReg) // src1 (input) - false value - .addReg(TrueIntReg) // src2 (input) - true value - .addReg(CondByteReg); // pre-materialized condition byte (input) - - // Store result back to result slot - BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr)) - .addFrameIndex(ResultSlot) - .addImm(1) - .addReg(0) - .addImm(Offset) - .addReg(0) - .addReg(ResultIntReg, RegState::Kill); - } - }; + auto emitCtSelectFromMemory = + [&](unsigned NumValues, const FPLoadMemOperands &TrueMemOps, + const FPLoadMemOperands &FalseMemOps, int ResultSlot) { + for (unsigned Val = 0; Val < NumValues; ++Val) { + unsigned Offset = Val * RegSizeInByte; + + // Load true and false values directly from their memory locations as + // integers + unsigned TrueIntReg = loadIntFromMemOperands(TrueMemOps, Offset); + unsigned FalseIntReg = loadIntFromMemOperands(FalseMemOps, Offset); + + // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time + // selection + unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass); + unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); + unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass); + + BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr)) + .addDef(ResultIntReg) // dst (output) + .addDef(TmpByteReg) // tmp_byte (output) + .addDef(TmpMaskReg) // tmp_mask (output) + .addReg(FalseIntReg) // src1 (input) - false value + .addReg(TrueIntReg) // src2 (input) - true value + .addReg(CondByteReg); // pre-materialized condition byte (input) + + // Store result back to result slot + BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr)) + .addFrameIndex(ResultSlot) + .addImm(1) + .addReg(0) + .addImm(Offset) + .addReg(0) + .addReg(ResultIntReg, RegState::Kill); + } + }; - auto emitCtSelectWithPseudo = [&](unsigned NumValues, int TrueSlot, int FalseSlot, int ResultSlot) { + auto emitCtSelectWithPseudo = [&](unsigned NumValues, int TrueSlot, + int FalseSlot, int ResultSlot) { for (unsigned Val = 0; Val < NumValues; ++Val) { unsigned Offset = Val * RegSizeInByte; - + // Load true and false values from stack as 32-bit integers unsigned TrueIntReg = MRI.createVirtualRegister(&X86::GR32RegClass); BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), TrueIntReg) @@ -38244,18 +38341,19 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI, .addImm(Offset) .addReg(0); - // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time selection + // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time + // selection unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass); unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass); - + BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr)) - .addDef(ResultIntReg) // dst (output) - .addDef(TmpByteReg) // tmp_byte (output) - .addDef(TmpMaskReg) // tmp_mask (output) - .addReg(FalseIntReg) // src1 (input) - false value - .addReg(TrueIntReg) // src2 (input) - true value - .addReg(CondByteReg); // pre-materialized condition byte (input) + .addDef(ResultIntReg) // dst (output) + .addDef(TmpByteReg) // tmp_byte (output) + .addDef(TmpMaskReg) // tmp_mask (output) + .addReg(FalseIntReg) // src1 (input) - false value + .addReg(TrueIntReg) // src2 (input) - true value + .addReg(CondByteReg); // pre-materialized condition byte (input) // Store result back to result slot BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr)) @@ -38416,7 
+38514,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, const MIMetadata MIMD(MI); auto TMMImmToTMMReg = [](unsigned Imm) { - assert (Imm < 8 && "Illegal tmm index"); + assert(Imm < 8 && "Illegal tmm index"); return X86::TMM0 + Imm; }; switch (MI.getOpcode()) { @@ -38483,7 +38581,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP64rr); case X86::CTSELECT_I386_FP80rr: return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP80rr); - + case X86::FP80_ADDr: case X86::FP80_ADDm32: { // Change the floating point control register to use double extended @@ -38571,29 +38669,30 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero. Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW) - .addReg(OldCW, RegState::Kill).addImm(0xC00); + .addReg(OldCW, RegState::Kill) + .addImm(0xC00); // Extract to 16 bits. Register NewCW16 = MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass); BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16) - .addReg(NewCW, RegState::Kill, X86::sub_16bit); + .addReg(NewCW, RegState::Kill, X86::sub_16bit); // Prepare memory for FLDCW. int NewCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, Align(2), false); addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)), NewCWFrameIdx) - .addReg(NewCW16, RegState::Kill); + .addReg(NewCW16, RegState::Kill); // Reload the modified control word now... - addFrameReference(BuildMI(*BB, MI, MIMD, - TII->get(X86::FLDCW16m)), NewCWFrameIdx); + addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)), + NewCWFrameIdx); // Get the X86 opcode to use. unsigned Opc; switch (MI.getOpcode()) { - // clang-format off + // clang-format off default: llvm_unreachable("illegal opcode!"); case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; @@ -38604,7 +38703,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; - // clang-format on + // clang-format on } X86AddressMode AM = getAddressFromInstr(&MI, 0); @@ -38821,7 +38920,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PTMMULTF32PS: { unsigned Opc; switch (MI.getOpcode()) { - default: llvm_unreachable("illegal opcode!"); + default: + llvm_unreachable("illegal opcode!"); // clang-format off case X86::PTDPBSSD: Opc = X86::TDPBSSD; break; case X86::PTDPBSUD: Opc = X86::TDPBSUD; break; @@ -38868,7 +38968,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PTILESTORED: { unsigned Opc; switch (MI.getOpcode()) { - default: llvm_unreachable("illegal opcode!"); + default: + llvm_unreachable("illegal opcode!"); #define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? 
OPC##_EVEX : OPC) case X86::PTILELOADD: Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD); @@ -38990,11 +39091,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // X86 Optimization Hooks //===----------------------------------------------------------------------===// -bool -X86TargetLowering::targetShrinkDemandedConstant(SDValue Op, - const APInt &DemandedBits, - const APInt &DemandedElts, - TargetLoweringOpt &TLO) const { +bool X86TargetLowering::targetShrinkDemandedConstant( + SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, + TargetLoweringOpt &TLO) const { EVT VT = Op.getValueType(); unsigned Opcode = Op.getOpcode(); unsigned EltSize = VT.getScalarSizeInBits(); @@ -39179,16 +39278,15 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, unsigned NumElts = DemandedElts.getBitWidth(); unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); - assert((Opc >= ISD::BUILTIN_OP_END || - Opc == ISD::INTRINSIC_WO_CHAIN || - Opc == ISD::INTRINSIC_W_CHAIN || - Opc == ISD::INTRINSIC_VOID) && + assert((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || + Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID) && "Should use MaskedValueIsZero if you don't know whether Op" " is a target node!"); Known.resetAll(); switch (Opc) { - default: break; + default: + break; case X86ISD::MUL_IMM: { KnownBits Known2; Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); @@ -39417,7 +39515,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); - if (auto* Cst1 = dyn_cast(Op1)) { + if (auto *Cst1 = dyn_cast(Op1)) { unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0); unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8); @@ -39611,7 +39709,8 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, unsigned NumElts = VT.getVectorNumElements(); if (Mask.size() == NumElts) { SmallVector DemandedOps(NumOps, APInt(NumElts, 0)); - Known.Zero.setAllBits(); Known.One.setAllBits(); + Known.Zero.setAllBits(); + Known.One.setAllBits(); for (unsigned i = 0; i != NumElts; ++i) { if (!DemandedElts[i]) continue; @@ -39756,16 +39855,18 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( case X86ISD::ANDNP: { unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1); - if (Tmp0 == 1) return 1; // Early out. + if (Tmp0 == 1) + return 1; // Early out. unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1); return std::min(Tmp0, Tmp1); } case X86ISD::CMOV: { - unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1); - if (Tmp0 == 1) return 1; // Early out. - unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1); + unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); + if (Tmp0 == 1) + return 1; // Early out. + unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1); return std::min(Tmp0, Tmp1); } } @@ -40141,7 +40242,6 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef Mask, PermuteImm = (unsigned)ShiftAmt; return true; } - } } @@ -40201,7 +40301,8 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef Mask, // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle. 
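A scalar sketch of the lane semantics behind that PACKSS/PACKUS matching (an illustration under the usual x86 pack saturation rules, not code from this patch): each signed 16-bit element is narrowed to 8 bits, with signed saturation for PACKSS and unsigned saturation for PACKUS.

#include <algorithm>
#include <cstdint>

// One lane of PACKSSWB: signed i16 -> signed i8 with saturation.
static int8_t packss_lane(int16_t x) {
  return static_cast<int8_t>(std::clamp<int16_t>(x, -128, 127));
}

// One lane of PACKUSWB: signed i16 -> unsigned i8 with saturation.
static uint8_t packus_lane(int16_t x) {
  return static_cast<uint8_t>(std::clamp<int16_t>(x, 0, 255));
}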
if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) || - ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) || + ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && + Subtarget.hasInt256()) || ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) { if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG, Subtarget)) { @@ -40760,9 +40861,9 @@ static SDValue combineX86ShuffleChain( SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2; SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2; return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT, - CanonicalizeShuffleInput(RootVT, LHS), - CanonicalizeShuffleInput(RootVT, RHS), - DAG.getTargetConstant(PermMask, DL, MVT::i8)); + CanonicalizeShuffleInput(RootVT, LHS), + CanonicalizeShuffleInput(RootVT, RHS), + DAG.getTargetConstant(PermMask, DL, MVT::i8)); } } } @@ -40856,8 +40957,8 @@ static SDValue combineX86ShuffleChain( } if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, - AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT, - PermuteImm) && + AllowIntDomain, DAG, Subtarget, Shuffle, + ShuffleVT, PermuteImm) && (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 0 && RootOpc == Shuffle) @@ -41736,11 +41837,9 @@ static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef Ops, } namespace llvm { - namespace X86 { - enum { - MaxShuffleCombineDepth = 8 - }; - } // namespace X86 +namespace X86 { +enum { MaxShuffleCombineDepth = 8 }; +} // namespace X86 } // namespace llvm /// Fully generic combining of x86 shuffle instructions. @@ -42144,7 +42243,8 @@ static SDValue combineX86ShufflesRecursively( // The Op itself may be of different VT, so we need to scale the mask. unsigned NumOpElts = Op.getValueType().getVectorNumElements(); - APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts); + APInt OpScaledDemandedElts = + APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts); // Can this operand be simplified any further, given it's demanded elements? if (SDValue NewOp = TLI.SimplifyMultipleUseDemandedVectorElts( @@ -42950,7 +43050,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, ISD::isNormalLoad(Src.getNode())) { LoadSDNode *LN = cast(Src); SDVTList Tys = DAG.getVTList(VT, MVT::Other); - SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; + SDValue Ops[] = {LN->getChain(), LN->getBasePtr()}; SDValue BcastLd = DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, LN->getMemoryVT(), LN->getMemOperand()); @@ -42982,7 +43082,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, // Unless its volatile or atomic. 
if (LN->isSimple()) { SDVTList Tys = DAG.getVTList(VT, MVT::Other); - SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; + SDValue Ops[] = {LN->getChain(), LN->getBasePtr()}; SDValue BcastLd = DAG.getMemIntrinsicNode( X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16, LN->getPointerInfo(), LN->getBaseAlign(), @@ -43000,7 +43100,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, LoadSDNode *LN = cast(Src.getOperand(0)); if (LN->getMemoryVT().getSizeInBits() == 16) { SDVTList Tys = DAG.getVTList(VT, MVT::Other); - SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; + SDValue Ops[] = {LN->getChain(), LN->getBasePtr()}; SDValue BcastLd = DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, LN->getMemoryVT(), LN->getMemOperand()); @@ -43027,7 +43127,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SDVTList Tys = DAG.getVTList(VT, MVT::Other); SDValue Ptr = DAG.getMemBasePlusOffset( LN->getBasePtr(), TypeSize::getFixed(Offset), DL); - SDValue Ops[] = { LN->getChain(), Ptr }; + SDValue Ops[] = {LN->getChain(), Ptr}; SDValue BcastLd = DAG.getMemIntrinsicNode( X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16, LN->getPointerInfo().getWithOffset(Offset), LN->getBaseAlign(), @@ -43045,7 +43145,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, MemSDNode *LN = cast(Src); if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) { SDVTList Tys = DAG.getVTList(VT, MVT::Other); - SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; + SDValue Ops[] = {LN->getChain(), LN->getBasePtr()}; SDValue BcastLd = DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, LN->getMemoryVT(), LN->getMemOperand()); @@ -43554,13 +43654,13 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) { auto *MemIntr = cast(Op1); if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) { - SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(), - MemIntr->getBasePtr(), - MemIntr->getMemOperand()); - SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, - DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, - Load), - DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8)); + SDValue Load = + DAG.getLoad(MVT::f32, DL, MemIntr->getChain(), + MemIntr->getBasePtr(), MemIntr->getMemOperand()); + SDValue Insert = DAG.getNode( + X86ISD::INSERTPS, DL, VT, Op0, + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Load), + DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8)); DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1)); return Insert; } @@ -43714,8 +43814,8 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, if (Mask[0] == Mask[1] && Mask[2] == Mask[3] && (V.getOpcode() == X86ISD::PSHUFLW || V.getOpcode() == X86ISD::PSHUFHW) && - V.getOpcode() != N.getOpcode() && - V.hasOneUse() && V.getOperand(0).hasOneUse()) { + V.getOpcode() != N.getOpcode() && V.hasOneUse() && + V.getOperand(0).hasOneUse()) { SDValue D = peekThroughOneUseBitcasts(V.getOperand(0)); if (D.getOpcode() == X86ISD::PSHUFD) { SmallVector VMask = getPSHUFShuffleMask(V); @@ -43789,11 +43889,11 @@ static bool isAddSubOrSubAddMask(ArrayRef Mask, bool &Op0Even) { /// operation. If true is returned then the operands of ADDSUB(SUBADD) operation /// are written to the parameters \p Opnd0 and \p Opnd1. /// -/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes -/// so it is easier to generically match. 
We also insert dummy vector shuffle -/// nodes for the operands which explicitly discard the lanes which are unused -/// by this operation to try to flow through the rest of the combiner -/// the fact that they're unused. +/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle +/// nodes so it is easier to generically match. We also insert dummy vector +/// shuffle nodes for the operands which explicitly discard the lanes which are +/// unused by this operation to try to flow through the rest of the combiner the +/// fact that they're unused. static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, bool &IsSubAdd, bool &HasAllowContract) { @@ -43827,13 +43927,15 @@ static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget, // commute the FADD operands. SDValue LHS, RHS; if (V1.getOpcode() == ISD::FSUB) { - LHS = V1->getOperand(0); RHS = V1->getOperand(1); + LHS = V1->getOperand(0); + RHS = V1->getOperand(1); if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) && (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS)) return false; } else { assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode"); - LHS = V2->getOperand(0); RHS = V2->getOperand(1); + LHS = V2->getOperand(0); + RHS = V2->getOperand(1); if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) && (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS)) return false; @@ -43845,8 +43947,8 @@ static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget, return false; // It's a subadd if the vector in the even parity is an FADD. - IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD - : V2->getOpcode() == ISD::FADD; + IsSubAdd = + Op0Even ? V1->getOpcode() == ISD::FADD : V2->getOpcode() == ISD::FADD; HasAllowContract = V1->getFlags().hasAllowContract() && V2->getFlags().hasAllowContract(); @@ -44135,7 +44237,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( // TODO: Multiply by zero. - // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent. + // If RHS/LHS elements are known zero then we don't need the LHS/RHS + // equivalent. APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero; if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO, Depth + 1)) @@ -44909,7 +45012,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( // For splats, unless we *only* demand the 0'th element, // stop attempts at simplification here, we aren't going to improve things, // this is better than any potential shuffle. - if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false)) + if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/ false)) return false; // Get target/faux shuffle mask. @@ -45007,7 +45110,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( EVT VT = Op.getValueType(); unsigned BitWidth = OriginalDemandedBits.getBitWidth(); unsigned Opc = Op.getOpcode(); - switch(Opc) { + switch (Opc) { case X86ISD::VTRUNC: { KnownBits KnownOp; SDValue Src = Op.getOperand(0); @@ -45015,8 +45118,10 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( // Simplify the input, using demanded bit information. 
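For orientation (a scalar sketch, not code from this patch): the ADDSUB(SUBADD) shape that isAddSubOrSubAdd recognizes alternates the two operations across lanes, which is what the Op0Even/IsSubAdd bookkeeping distinguishes.

#include <cstddef>

// Scalar model of ADDSUB: even lanes take a[i] - b[i], odd lanes a[i] + b[i].
// SUBADD is the same shape with the two roles swapped.
static void addsub(const float *a, const float *b, float *r, size_t n) {
  for (size_t i = 0; i < n; ++i)
    r[i] = (i % 2 == 0) ? a[i] - b[i] : a[i] + b[i];
}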
APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits()); - APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements()); - if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1)) + APInt DemandedElts = + OriginalDemandedElts.trunc(SrcVT.getVectorNumElements()); + if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, + Depth + 1)) return true; break; } @@ -45120,7 +45225,8 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( } } - // If we are only demanding sign bits then we can use the shift source directly. + // If we are only demanding sign bits then we can use the shift source + // directly. unsigned NumSignBits = TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1); unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero(); @@ -45311,8 +45417,8 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( return true; KnownBits KnownVec; - if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts, - KnownVec, TLO, Depth + 1)) + if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts, KnownVec, + TLO, Depth + 1)) return true; if (SDValue V = SimplifyMultipleUseDemandedBits( @@ -45948,13 +46054,13 @@ static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size, // Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents. static unsigned getAltBitOpcode(unsigned Opcode) { - switch(Opcode) { - // clang-format off + switch (Opcode) { + // clang-format off case ISD::AND: return X86ISD::FAND; case ISD::OR: return X86ISD::FOR; case ISD::XOR: return X86ISD::FXOR; case X86ISD::ANDNP: return X86ISD::FANDN; - // clang-format on + // clang-format on } llvm_unreachable("Unknown bitwise opcode"); } @@ -46177,8 +46283,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, // Convert a vXi1 constant build vector to the same width scalar integer. static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) { EVT SrcVT = Op.getValueType(); - assert(SrcVT.getVectorElementType() == MVT::i1 && - "Expected a vXi1 vector"); + assert(SrcVT.getVectorElementType() == MVT::i1 && "Expected a vXi1 vector"); assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && "Expected a constant build vector"); @@ -46496,7 +46601,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, // and the vbroadcast_load are both integer or both fp. In some cases this // will remove the bitcast entirely. 
if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() && - VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) { + VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) { auto *BCast = cast(N0); unsigned SrcVTSize = SrcVT.getScalarSizeInBits(); unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits(); @@ -46509,7 +46614,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements()); SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other); - SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() }; + SDValue Ops[] = {BCast->getChain(), BCast->getBasePtr()}; SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops, MemVT, BCast->getMemOperand()); @@ -46559,7 +46664,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, bool LowUndef = true, AllUndefOrZero = true; for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) { SDValue Op = N0.getOperand(i); - LowUndef &= Op.isUndef() || (i >= e/2); + LowUndef &= Op.isUndef() || (i >= e / 2); AllUndefOrZero &= isNullConstantOrUndef(Op); } if (AllUndefOrZero) { @@ -46601,8 +46706,8 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, // Try to remove a bitcast of constant vXi1 vector. We have to legalize // most of these to scalar anyway. - if (Subtarget.hasAVX512() && VT.isScalarInteger() && - SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 && + if (Subtarget.hasAVX512() && VT.isScalarInteger() && SrcVT.isVector() && + SrcVT.getVectorElementType() == MVT::i1 && ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) { return combinevXi1ConstantToInteger(N0, DAG); } @@ -46620,8 +46725,8 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1. // Turn it into a sign bit compare that produces a k-register. This avoids // a trip through a GPR. - if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && - VT.isVector() && VT.getVectorElementType() == MVT::i1 && + if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() && + VT.getVectorElementType() == MVT::i1 && isPowerOf2_32(VT.getVectorNumElements())) { unsigned NumElts = VT.getVectorNumElements(); SDValue Src = N0; @@ -46675,12 +46780,12 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, // transferring the SSE operand to integer register and back. unsigned FPOpcode; switch (N0.getOpcode()) { - // clang-format off + // clang-format off case ISD::AND: FPOpcode = X86ISD::FAND; break; case ISD::OR: FPOpcode = X86ISD::FOR; break; case ISD::XOR: FPOpcode = X86ISD::FXOR; break; default: return SDValue(); - // clang-format on + // clang-format on } // Check if we have a bitcast from another integer type as well. @@ -46781,7 +46886,7 @@ static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, // Actually build the DotProduct, split as 256/512 bits for // AVXVNNI/AVX512VNNI. auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL, - ArrayRef Ops) { + ArrayRef Ops) { MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops); }; @@ -46896,7 +47001,8 @@ static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, DAG.getVectorIdxConstant(0, DL)); } -// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK. +// Attempt to replace an all_of/any_of/parity style horizontal reduction with a +// MOVMSK. 
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // Bail without SSE2. @@ -47171,9 +47277,9 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, if (Stages > 3) { unsigned SadElems = SadVT.getVectorNumElements(); - for(unsigned i = Stages - 3; i > 0; --i) { + for (unsigned i = Stages - 3; i > 0; --i) { SmallVector Mask(SadElems, -1); - for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j) + for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j) Mask[j] = MaskEnd + j; SDValue Shuffle = @@ -47489,10 +47595,10 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Vec.getOperand(0).getValueType().getScalarType(), Vec.getOperand(0), Index); - SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, - Vec.getOperand(1), Index); - SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, - Vec.getOperand(2), Index); + SDValue Ext1 = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Vec.getOperand(1), Index); + SDValue Ext2 = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Vec.getOperand(2), Index); return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2); } @@ -47772,8 +47878,9 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT); } - // Convert extract_element(bitcast() -> bitcast(extract_subvector()). - // Improves lowering of bool masks on rust which splits them into byte array. + // Convert extract_element(bitcast() -> + // bitcast(extract_subvector()). Improves lowering of bool masks on rust + // which splits them into byte array. if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) { SDValue Src = peekThroughBitcasts(InputVector); if (Src.getValueType().getScalarType() == MVT::i1 && @@ -48123,8 +48230,7 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue Cond = N->getOperand(0); - if ((N->getOpcode() != ISD::VSELECT && - N->getOpcode() != X86ISD::BLENDV) || + if ((N->getOpcode() != ISD::VSELECT && N->getOpcode() != X86ISD::BLENDV) || ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) return SDValue(); @@ -48397,7 +48503,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Check for x CC y ? x : y. if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) { switch (CC) { - default: break; + default: + break; case ISD::SETULT: // Converting this to a min would handle NaNs incorrectly, and swapping // the operands would cause it to handle comparisons between positive @@ -48462,10 +48569,11 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, Opcode = X86ISD::FMAX; break; } - // Check for x CC y ? y : x -- a min/max with reversed arms. + // Check for x CC y ? y : x -- a min/max with reversed arms. } else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) { switch (CC) { - default: break; + default: + break; case ISD::SETOGE: // Converting this to a min would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would @@ -48669,13 +48777,13 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, Cond1 == InnerSetCC.getOperand(1)) { ISD::CondCode NewCC; switch (CC == ISD::SETEQ ? 
InnerCC : CC) { - // clang-format off + // clang-format off case ISD::SETGT: NewCC = ISD::SETGE; break; case ISD::SETLT: NewCC = ISD::SETLE; break; case ISD::SETUGT: NewCC = ISD::SETUGE; break; case ISD::SETULT: NewCC = ISD::SETULE; break; default: NewCC = ISD::SETCC_INVALID; break; - // clang-format on + // clang-format on } if (NewCC != ISD::SETCC_INVALID) { Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC); @@ -48845,9 +48953,9 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // 16-bit lacks a proper blendv. unsigned EltBitWidth = VT.getScalarSizeInBits(); bool CanShiftBlend = - TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) || - (Subtarget.hasAVX2() && EltBitWidth == 64) || - (Subtarget.hasXOP())); + TLI.isTypeLegal(VT) && + ((Subtarget.hasAVX() && EltBitWidth == 32) || + (Subtarget.hasAVX2() && EltBitWidth == 64) || (Subtarget.hasXOP())); if (CanShiftBlend && ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) { return C->getAPIntValue().isPowerOf2(); @@ -49086,7 +49194,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { SDValue Op2 = Cmp.getOperand(1); SDValue SetCC; - const ConstantSDNode* C = nullptr; + const ConstantSDNode *C = nullptr; bool needOppositeCond = (CC == X86::COND_E); bool checkAgainstTrue = false; // Is it a comparison against 1? @@ -49107,8 +49215,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { bool truncatedToBoolWithAnd = false; // Skip (zext $x), (trunc $x), or (and $x, 1) node. while (SetCC.getOpcode() == ISD::ZERO_EXTEND || - SetCC.getOpcode() == ISD::TRUNCATE || - SetCC.getOpcode() == ISD::AND) { + SetCC.getOpcode() == ISD::TRUNCATE || SetCC.getOpcode() == ISD::AND) { if (SetCC.getOpcode() == ISD::AND) { int OpIdx = -1; if (isOneConstant(SetCC.getOperand(0))) @@ -49151,13 +49258,13 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { if (!FVal) { SDValue Op = SetCC.getOperand(0); // Skip 'zext' or 'trunc' node. - if (Op.getOpcode() == ISD::ZERO_EXTEND || - Op.getOpcode() == ISD::TRUNCATE) + if (Op.getOpcode() == ISD::ZERO_EXTEND || Op.getOpcode() == ISD::TRUNCATE) Op = Op.getOperand(0); // A special case for rdrand/rdseed, where 0 is set if false cond is // found. if ((Op.getOpcode() != X86ISD::RDRAND && - Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0) + Op.getOpcode() != X86ISD::RDSEED) || + Op.getResNo() != 0) return SDValue(); } // Quit if false value is not the constant 0 or 1. @@ -49202,7 +49309,8 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, SDValue SetCC0, SetCC1; switch (Cond->getOpcode()) { - default: return false; + default: + return false; case ISD::AND: case X86ISD::AND: isAnd = true; @@ -49267,8 +49375,7 @@ static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) { } // If this is a check of the z flag of an add with 1, switch to the // C flag. - if (CarryCC == X86::COND_E && - CarryOp1.getOpcode() == X86ISD::ADD && + if (CarryCC == X86::COND_E && CarryOp1.getOpcode() == X86ISD::ADD && isOneConstant(CarryOp1.getOperand(1))) return CarryOp1; } else if (FoundAndLSB) { @@ -49801,12 +49908,11 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient // for any integer data type, including i8/i16. 
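A plain C++ sketch of the two branchless shapes described in the comment above and in the fast-multiplier cases that follow (illustration only; the diff values 1, 2, 3, 4, 5, 8 and 9 are the ones that map onto ADD or a single LEA):

#include <cstdint>

// cond ? base + 1 : base  ->  base + zext(cond); no conditional move needed.
static uint32_t selectAdjacent(bool cond, uint32_t base) {
  return base + static_cast<uint32_t>(cond);
}

// cond ? base + diff : base  ->  base + zext(cond) * diff; for the small
// diffs listed above the multiply folds into ADD or LEA scaling.
static uint32_t selectByDiff(bool cond, uint32_t base, uint32_t diff) {
  return base + static_cast<uint32_t>(cond) * diff;
}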
- if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { + if (FalseC->getAPIntValue() + 1 == TrueC->getAPIntValue()) { Cond = getSETCC(CC, Cond, DL, DAG); // Zero extend the condition if needed. - Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, - FalseC->getValueType(0), Cond); + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, SDValue(FalseC, 0)); return Cond; @@ -49822,24 +49928,25 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, bool isFastMultiplier = false; if (Diff.ult(10)) { switch (Diff.getZExtValue()) { - default: break; - case 1: // result = add base, cond - case 2: // result = lea base( , cond*2) - case 3: // result = lea base(cond, cond*2) - case 4: // result = lea base( , cond*4) - case 5: // result = lea base(cond, cond*4) - case 8: // result = lea base( , cond*8) - case 9: // result = lea base(cond, cond*8) + default: + break; + case 1: // result = add base, cond + case 2: // result = lea base( , cond*2) + case 3: // result = lea base(cond, cond*2) + case 4: // result = lea base( , cond*4) + case 5: // result = lea base(cond, cond*4) + case 8: // result = lea base( , cond*8) + case 9: // result = lea base(cond, cond*8) isFastMultiplier = true; break; } } if (isFastMultiplier) { - Cond = getSETCC(CC, Cond, DL ,DAG); + Cond = getSETCC(CC, Cond, DL, DAG); // Zero extend the condition if needed. - Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), - Cond); + Cond = + DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); // Scale the condition by the difference. if (Diff != 1) Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, @@ -50630,8 +50737,7 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG, // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) // since the result of setcc_c is all zero's or all ones. - if (VT.isInteger() && !VT.isVector() && - N1C && N0.getOpcode() == ISD::AND && + if (VT.isInteger() && !VT.isVector() && N1C && N0.getOpcode() == ISD::AND && N0.getOperand(1).getOpcode() == ISD::Constant) { SDValue N00 = N0.getOperand(0); APInt Mask = N0.getConstantOperandAPInt(1); @@ -50715,7 +50821,7 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, if (SraConst.isNegative()) return SDValue(); - for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) { + for (MVT SVT : {MVT::i8, MVT::i16, MVT::i32}) { unsigned ShiftSize = SVT.getSizeInBits(); // Only deal with (Size - ShlConst) being equal to 8, 16 or 32. if (ShiftSize >= Size || ShlConst != Size - ShiftSize) @@ -51049,8 +51155,8 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular // truncate to create a larger truncate. 
- if (Subtarget.hasAVX512() && - N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 && + if (Subtarget.hasAVX512() && N0.getOpcode() == ISD::TRUNCATE && + N1.isUndef() && VT == MVT::v16i8 && N0.getOperand(0).getValueType() == MVT::v8i32) { if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) || (!IsSigned && @@ -51397,7 +51503,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, SDValue CMP00 = CMP0->getOperand(0); SDValue CMP01 = CMP0->getOperand(1); - EVT VT = CMP00.getValueType(); + EVT VT = CMP00.getValueType(); if (VT == MVT::f32 || VT == MVT::f64 || (VT == MVT::f16 && Subtarget.hasFP16())) { @@ -51423,8 +51529,10 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, } if (!ExpectingFlags) { - enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); - enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); + enum X86::CondCode cc0 = + (enum X86::CondCode)N0.getConstantOperandVal(0); + enum X86::CondCode cc1 = + (enum X86::CondCode)N1.getConstantOperandVal(0); if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { X86::CondCode tmp = cc0; @@ -51432,7 +51540,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, cc1 = tmp; } - if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || + if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { // FIXME: need symbolic constants for these magic numbers. // See X86ATTInstPrinter.cpp:printSSECC(). @@ -51442,7 +51550,8 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8)); // Need to fill with zeros to ensure the bitcast will produce zeroes - // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. + // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee + // that. 
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1, DAG.getConstant(0, DL, MVT::v16i1), FSetCC, DAG.getVectorIdxConstant(0, DL)); @@ -51474,8 +51583,8 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF); SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI, DAG.getConstant(1, DL, IntVT)); - SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, - ANDed); + SDValue OneBitOfTruth = + DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed); return OneBitOfTruth; } } @@ -51670,7 +51779,8 @@ static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, assert(VT.isVector() && "Expected vector type"); assert((N.getOpcode() == ISD::ANY_EXTEND || N.getOpcode() == ISD::ZERO_EXTEND || - N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node"); + N.getOpcode() == ISD::SIGN_EXTEND) && + "Invalid Node"); SDValue Narrow = N.getOperand(0); EVT NarrowVT = Narrow.getValueType(); @@ -51680,26 +51790,27 @@ static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, if (!Op) return SDValue(); switch (N.getOpcode()) { - default: llvm_unreachable("Unexpected opcode"); + default: + llvm_unreachable("Unexpected opcode"); case ISD::ANY_EXTEND: return Op; case ISD::ZERO_EXTEND: return DAG.getZeroExtendInReg(Op, DL, NarrowVT); case ISD::SIGN_EXTEND: - return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, - Op, DAG.getValueType(NarrowVT)); + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op, + DAG.getValueType(NarrowVT)); } } static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) { unsigned FPOpcode; switch (Opcode) { - // clang-format off + // clang-format off default: llvm_unreachable("Unexpected input node for FP logic conversion"); case ISD::AND: FPOpcode = X86ISD::FAND; break; case ISD::OR: FPOpcode = X86ISD::FOR; break; case ISD::XOR: FPOpcode = X86ISD::FXOR; break; - // clang-format on + // clang-format on } return FPOpcode; } @@ -52142,8 +52253,7 @@ static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, SmallVector Ops(Src.getNumOperands(), DAG.getConstant(0, dl, SubVecVT)); Ops[0] = SubVec; - SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, - Ops); + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, Ops); EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits()); return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT); } @@ -52492,7 +52602,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) return R; - if (SDValue R = combineAndNotIntoANDNP(N, dl ,DAG)) + if (SDValue R = combineAndNotIntoANDNP(N, dl, DAG)) return R; if (SDValue ShiftRight = combineAndMaskToShift(N, dl, DAG, Subtarget)) @@ -53268,7 +53378,8 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, if (NotCond) { SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT); - R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT)); + R = DAG.getNode(ISD::MUL, dl, VT, R, + DAG.getConstant(Val + 1, dl, VT)); R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT)); return R; } @@ -53405,7 +53516,7 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, return SDValue(); switch (VT.getSimpleVT().SimpleTy) { - // clang-format off + // clang-format off default: return SDValue(); case MVT::v16i8: case MVT::v8i16: @@ -53535,8 +53646,8 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, // split across two registers. 
We can use a packusdw+perm to clamp to 0-65535 // and concatenate at the same time. Then we can use a final vpmovuswb to // clip to 0-255. - if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && - InVT == MVT::v16i32 && VT == MVT::v16i8) { + if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && InVT == MVT::v16i32 && + VT == MVT::v16i8) { if (SDValue USatVal = detectSSatPattern(In, VT, true)) { // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB. SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal, @@ -53552,11 +53663,12 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, // FIXME: We could widen truncates to 512 to remove the VLX restriction. // If the result type is 256-bits or larger and we have disable 512-bit // registers, we should go ahead and use the pack instructions if possible. - bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) || - (Subtarget.hasBWI() && InSVT == MVT::i16)) && - (InVT.getSizeInBits() > 128) && - (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) && - !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256); + bool PreferAVX512 = + ((Subtarget.hasAVX512() && InSVT == MVT::i32) || + (Subtarget.hasBWI() && InSVT == MVT::i16)) && + (InVT.getSizeInBits() > 128) && + (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) && + !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256); if (!PreferAVX512 && VT.getVectorNumElements() > 1 && isPowerOf2_32(VT.getVectorNumElements()) && @@ -53569,8 +53681,8 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL, DAG, Subtarget); assert(Mid && "Failed to pack!"); - SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG, - Subtarget); + SDValue V = + truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG, Subtarget); assert(V && "Failed to pack!"); return V; } else if (SVT == MVT::i8 || Subtarget.hasSSE41()) @@ -53894,10 +54006,9 @@ reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, CastVT = VT.changeVectorElementType(EltVT); } - SDValue Load = - DAG.getLoad(EltVT, DL, ML->getChain(), Addr, - ML->getPointerInfo().getWithOffset(Offset), - Alignment, ML->getMemOperand()->getFlags()); + SDValue Load = DAG.getLoad(EltVT, DL, ML->getChain(), Addr, + ML->getPointerInfo().getWithOffset(Offset), + Alignment, ML->getMemOperand()->getFlags()); SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru()); @@ -53928,8 +54039,8 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, if (LoadFirstElt && LoadLastElt) { SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(), ML->getMemOperand()); - SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, - ML->getPassThru()); + SDValue Blend = + DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getPassThru()); return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true); } @@ -53951,8 +54062,8 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(), DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(), ML->getAddressingMode(), ML->getExtensionType()); - SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, - ML->getPassThru()); + SDValue Blend = + DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getPassThru()); return DCI.CombineTo(ML, Blend, NewML.getValue(1), true); } @@ -54032,8 +54143,8 @@ static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, // Store 
that element at the appropriate offset from the base pointer. return DAG.getStore(MS->getChain(), DL, Extract, Addr, - MS->getPointerInfo().getWithOffset(Offset), - Alignment, MS->getMemOperand()->getFlags()); + MS->getPointerInfo().getWithOffset(Offset), Alignment, + MS->getMemOperand()->getFlags()); } static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, @@ -54230,15 +54341,16 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, // Turn vXi1 stores of constants into a scalar store. if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 || - VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) && + VT == MVT::v64i1) && + VT == StVT && TLI.isTypeLegal(VT) && ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) { // If its a v64i1 store without 64-bit support, we need two stores. if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) { - SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl, - StoredVal->ops().slice(0, 32)); + SDValue Lo = + DAG.getBuildVector(MVT::v32i1, dl, StoredVal->ops().slice(0, 32)); Lo = combinevXi1ConstantToInteger(Lo, DAG); - SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl, - StoredVal->ops().slice(32, 32)); + SDValue Hi = + DAG.getBuildVector(MVT::v32i1, dl, StoredVal->ops().slice(32, 32)); Hi = combinevXi1ConstantToInteger(Hi, DAG); SDValue Ptr0 = St->getBasePtr(); @@ -54338,9 +54450,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, StoredVal.hasOneUse() && TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) { bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS; - return EmitTruncSStore(IsSigned, St->getChain(), - dl, StoredVal.getOperand(0), St->getBasePtr(), - VT, St->getMemOperand(), DAG); + return EmitTruncSStore(IsSigned, St->getChain(), dl, + StoredVal.getOperand(0), St->getBasePtr(), VT, + St->getMemOperand(), DAG); } // Try to fold a extract_element(VTRUNC) pattern into a truncating store. @@ -54379,14 +54491,14 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, if (St->isTruncatingStore() && VT.isVector()) { if (TLI.isTruncStoreLegal(VT, StVT)) { if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT())) - return EmitTruncSStore(true /* Signed saturation */, St->getChain(), - dl, Val, St->getBasePtr(), - St->getMemoryVT(), St->getMemOperand(), DAG); - if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(), - DAG, dl)) + return EmitTruncSStore(true /* Signed saturation */, St->getChain(), dl, + Val, St->getBasePtr(), St->getMemoryVT(), + St->getMemOperand(), DAG); + if (SDValue Val = + detectUSatPattern(St->getValue(), St->getMemoryVT(), DAG, dl)) return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(), - dl, Val, St->getBasePtr(), - St->getMemoryVT(), St->getMemOperand(), DAG); + dl, Val, St->getBasePtr(), St->getMemoryVT(), + St->getMemOperand(), DAG); } return SDValue(); @@ -55194,8 +55306,7 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, // (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))), // (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B)))))))) static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, - const X86Subtarget &Subtarget, - const SDLoc &DL) { + const X86Subtarget &Subtarget, const SDLoc &DL) { using namespace SDPatternMatch; if (!VT.isVector() || !Subtarget.hasSSSE3()) return SDValue(); @@ -55269,8 +55380,8 @@ static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, std::swap(IdxN01, IdxN11); } // N0 indices be the even element. 
N1 indices must be the next odd element. - if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || - IdxN01 != 2 * i || IdxN11 != 2 * i + 1) + if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i || + IdxN11 != 2 * i + 1) return SDValue(); SDValue N00In = N00Elt.getOperand(0); SDValue N01In = N01Elt.getOperand(0); @@ -55281,8 +55392,8 @@ static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, ZExtIn = N00In; SExtIn = N01In; } - if (ZExtIn != N00In || SExtIn != N01In || - ZExtIn != N10In || SExtIn != N11In) + if (ZExtIn != N00In || SExtIn != N01In || ZExtIn != N10In || + SExtIn != N11In) return SDValue(); } @@ -55302,14 +55413,13 @@ static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, // Shrink by adding truncate nodes and let DAGCombine fold with the // sources. EVT InVT = Ops[0].getValueType(); - assert(InVT.getScalarType() == MVT::i8 && - "Unexpected scalar element type"); + assert(InVT.getScalarType() == MVT::i8 && "Unexpected scalar element type"); assert(InVT == Ops[1].getValueType() && "Operands' types mismatch"); EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, InVT.getVectorNumElements() / 2); return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]); }; - return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn }, + return SplitOpsAndApply(DAG, Subtarget, DL, VT, {ZExtIn, SExtIn}, PMADDBuilder); } @@ -55494,7 +55604,7 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc, bool NegRes) { if (NegMul) { switch (Opcode) { - // clang-format off + // clang-format off default: llvm_unreachable("Unexpected opcode"); case ISD::FMA: Opcode = X86ISD::FNMADD; break; case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break; @@ -55508,13 +55618,13 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc, case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break; case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break; case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break; - // clang-format on + // clang-format on } } if (NegAcc) { switch (Opcode) { - // clang-format off + // clang-format off default: llvm_unreachable("Unexpected opcode"); case ISD::FMA: Opcode = X86ISD::FMSUB; break; case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break; @@ -55532,7 +55642,7 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc, case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break; case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break; case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break; - // clang-format on + // clang-format on } } @@ -55549,7 +55659,7 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc, case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break; case X86ISD::FNMSUB: Opcode = ISD::FMA; break; case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break; - // clang-format on + // clang-format on } } @@ -55681,13 +55791,13 @@ static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1)); unsigned IntOpcode; switch (N->getOpcode()) { - // clang-format off + // clang-format off default: llvm_unreachable("Unexpected FP logic op"); case X86ISD::FOR: IntOpcode = ISD::OR; break; case X86ISD::FXOR: IntOpcode = ISD::XOR; break; case X86ISD::FAND: IntOpcode = ISD::AND; break; case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break; - // clang-format on + // clang-format on } SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1); return DAG.getBitcast(VT, IntOp); @@ 
-56039,13 +56149,18 @@ static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) { // into FMINC and FMAXC, which are Commutative operations. unsigned NewOp = 0; switch (N->getOpcode()) { - default: llvm_unreachable("unknown opcode"); - case X86ISD::FMIN: NewOp = X86ISD::FMINC; break; - case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break; + default: + llvm_unreachable("unknown opcode"); + case X86ISD::FMIN: + NewOp = X86ISD::FMINC; + break; + case X86ISD::FMAX: + NewOp = X86ISD::FMAXC; + break; } - return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0), - N->getOperand(0), N->getOperand(1)); + return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0), N->getOperand(0), + N->getOperand(1)); } static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, @@ -56091,8 +56206,8 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize()) return SDValue(); - EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), - VT); + EVT SetCCType = + TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); // There are 4 possibilities involving NaN inputs, and these are the required // outputs: @@ -56142,8 +56257,8 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits); if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) { SDLoc dl(N); - SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT, - DAG.getBitcast(InVT, VZLoad)); + SDValue Convert = + DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad)); DCI.CombineTo(N, Convert); DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); DCI.recursivelyDeleteUnusedNodes(LN); @@ -56638,8 +56753,8 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, // Only combine legal element types. EVT SVT = VT.getVectorElementType(); - if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 && - SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64) + if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 && SVT != MVT::i64 && + SVT != MVT::f32 && SVT != MVT::f64) return SDValue(); // We don't have CMPP Instruction for vxf16 @@ -56679,16 +56794,15 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry)) - if (!DCI.isBeforeLegalizeOps() && - N0.getOpcode() == X86ISD::SETCC_CARRY) { + if (!DCI.isBeforeLegalizeOps() && N0.getOpcode() == X86ISD::SETCC_CARRY) { SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0), - N0->getOperand(1)); + N0->getOperand(1)); bool ReplaceOtherUses = !N0.hasOneUse(); DCI.CombineTo(N, Setcc); // Replace other uses with a truncate of the widened setcc_carry. if (ReplaceOtherUses) { - SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), - N0.getValueType(), Setcc); + SDValue Trunc = + DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), Setcc); DCI.CombineTo(N0.getNode(), Trunc); } @@ -56981,13 +57095,13 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND && N0.getOpcode() == X86ISD::SETCC_CARRY) { SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0), - N0->getOperand(1)); + N0->getOperand(1)); bool ReplaceOtherUses = !N0.hasOneUse(); DCI.CombineTo(N, Setcc); // Replace other uses with a truncate of the widened setcc_carry. 
if (ReplaceOtherUses) { - SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), - N0.getValueType(), Setcc); + SDValue Trunc = + DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), Setcc); DCI.CombineTo(N0.getNode(), Trunc); } @@ -57263,8 +57377,8 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) { SDValue BaseOp = LHS.getOperand(0); SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC); - SDValue SETCC1 = DAG.getSetCC( - DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC); + SDValue SETCC1 = DAG.getSetCC(DL, VT, BaseOp, + DAG.getConstant(-CInt, DL, OpVT), CC); return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT, SETCC0, SETCC1); } @@ -57624,19 +57738,25 @@ static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDLoc DL(GorS); if (auto *Gather = dyn_cast(GorS)) { - SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(), - Gather->getMask(), Base, Index, Scale } ; - return DAG.getMaskedGather(Gather->getVTList(), - Gather->getMemoryVT(), DL, Ops, - Gather->getMemOperand(), + SDValue Ops[] = {Gather->getChain(), + Gather->getPassThru(), + Gather->getMask(), + Base, + Index, + Scale}; + return DAG.getMaskedGather(Gather->getVTList(), Gather->getMemoryVT(), DL, + Ops, Gather->getMemOperand(), Gather->getIndexType(), Gather->getExtensionType()); } auto *Scatter = cast(GorS); - SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(), - Scatter->getMask(), Base, Index, Scale }; - return DAG.getMaskedScatter(Scatter->getVTList(), - Scatter->getMemoryVT(), DL, + SDValue Ops[] = {Scatter->getChain(), + Scatter->getValue(), + Scatter->getMask(), + Base, + Index, + Scale}; + return DAG.getMaskedScatter(Scatter->getVTList(), Scatter->getMemoryVT(), DL, Ops, Scatter->getMemOperand(), Scatter->getIndexType(), Scatter->isTruncatingStore()); @@ -57867,8 +57987,8 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); // The AND node needs bitcasts to/from an integer vector type around it. SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst); - SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0), - MaskConst); + SDValue NewAnd = + DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0), MaskConst); SDValue Res = DAG.getBitcast(VT, NewAnd); if (IsStrict) return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL); @@ -58054,8 +58174,8 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, // use CVTSI2P. assert(InVT == MVT::v2i64 && "Unexpected VT!"); SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0); - SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast, - { 0, 2, -1, -1 }); + SDValue Shuf = + DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast, {0, 2, -1, -1}); if (IsStrict) return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other}, {N->getOperand(0), Shuf}); @@ -58156,7 +58276,7 @@ static bool needCarryOrOverflowFlag(SDValue Flags) { } switch (CC) { - // clang-format off + // clang-format off default: break; case X86::COND_A: case X86::COND_AE: case X86::COND_B: case X86::COND_BE: @@ -58164,7 +58284,7 @@ static bool needCarryOrOverflowFlag(SDValue Flags) { case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: return true; - // clang-format on + // clang-format on } } @@ -58300,11 +58420,12 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, // After this the truncate and arithmetic op must have a single use. 
if (!Trunc.hasOneUse() || !Op.hasOneUse()) - return SDValue(); + return SDValue(); unsigned NewOpc; switch (Op.getOpcode()) { - default: return SDValue(); + default: + return SDValue(); case ISD::AND: // Skip and with constant. We have special handling for and with immediate // during isel to generate test instructions. @@ -58312,8 +58433,12 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, return SDValue(); NewOpc = X86ISD::AND; break; - case ISD::OR: NewOpc = X86ISD::OR; break; - case ISD::XOR: NewOpc = X86ISD::XOR; break; + case ISD::OR: + NewOpc = X86ISD::OR; + break; + case ISD::XOR: + NewOpc = X86ISD::XOR; + break; case ISD::ADD: // If the carry or overflow flag is used, we can't truncate. if (needCarryOrOverflowFlag(SDValue(N, 0))) @@ -58490,9 +58615,8 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N, - const SDLoc &DL, EVT VT, - const X86Subtarget &Subtarget) { +static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, + EVT VT, const X86Subtarget &Subtarget) { using namespace SDPatternMatch; // Example of pattern we try to detect: @@ -58600,9 +58724,8 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N, // Attempt to turn this pattern into PMADDWD. // (add (mul (sext (build_vector)), (sext (build_vector))), // (mul (sext (build_vector)), (sext (build_vector))) -static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, - const SDLoc &DL, EVT VT, - const X86Subtarget &Subtarget) { +static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, + EVT VT, const X86Subtarget &Subtarget) { using namespace SDPatternMatch; if (!Subtarget.hasSSE2()) @@ -58698,7 +58821,7 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, // If the output is narrower than an input, extract the low part of the input // vector. EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16, - VT.getVectorNumElements() * 2); + VT.getVectorNumElements() * 2); if (OutVT16.bitsLT(In0.getValueType())) { In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0, DAG.getVectorIdxConstant(0, DL)); @@ -58707,8 +58830,7 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1, DAG.getVectorIdxConstant(0, DL)); } - return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 }, - PMADDBuilder); + return SplitOpsAndApply(DAG, Subtarget, DL, VT, {In0, In1}, PMADDBuilder); } // ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W)) @@ -59677,8 +59799,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, unsigned Imm1 = Ops[1].getConstantOperandVal(2); // TODO: Handle zero'd subvectors. if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) { - int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3), (int)(Imm1 & 0x03), - (int)((Imm1 >> 4) & 0x3)}; + int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3), + (int)(Imm1 & 0x03), (int)((Imm1 >> 4) & 0x3)}; MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64; SDValue LHS = concatSubVectors(Ops[0].getOperand(0), Ops[0].getOperand(1), DAG, DL); @@ -59866,8 +59988,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, break; } - ISD::CondCode ICC = - Opcode == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT; + ISD::CondCode ICC = Opcode == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT; ISD::CondCode FCC = Opcode == X86ISD::PCMPEQ ? 
ISD::SETOEQ : ISD::SETOGT; @@ -60217,7 +60338,8 @@ static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, APInt Constant = APInt::getZero(VT.getSizeInBits()); for (unsigned I = 0, E = Ops.size(); I != E; ++I) { auto *C = dyn_cast(peekThroughBitcasts(Ops[I])); - if (!C) break; + if (!C) + break; Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits); if (I == (E - 1)) { EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); @@ -60290,9 +60412,9 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) && Ins.getOperand(1).getValueSizeInBits().getFixedValue() <= SubVecVT.getFixedSizeInBits()) - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, - getZeroVector(OpVT, Subtarget, DAG, dl), - Ins.getOperand(1), N->getOperand(2)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, + getZeroVector(OpVT, Subtarget, DAG, dl), + Ins.getOperand(1), N->getOperand(2)); } } @@ -60982,7 +61104,7 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, LHS.getOperand(0).getValueType() == MVT::v4i32) { SDLoc dl(N); LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0), - LHS.getOperand(0), { 0, -1, 1, -1 }); + LHS.getOperand(0), {0, -1, 1, -1}); LHS = DAG.getBitcast(MVT::v2i64, LHS); return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS); } @@ -60992,7 +61114,7 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, RHS.getOperand(0).getValueType() == MVT::v4i32) { SDLoc dl(N); RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0), - RHS.getOperand(0), { 0, -1, 1, -1 }); + RHS.getOperand(0), {0, -1, 1, -1}); RHS = DAG.getBitcast(MVT::v2i64, RHS); return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS); } @@ -61263,16 +61385,16 @@ static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, // Widen to at least 8 input elements. if (NumElts < 8) { unsigned NumConcats = 8 / NumElts; - SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT) - : DAG.getConstant(0, dl, IntVT); + SDValue Fill = + NumElts == 4 ? DAG.getUNDEF(IntVT) : DAG.getConstant(0, dl, IntVT); SmallVector Ops(NumConcats, Fill); Ops[0] = Src; Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops); } // Destination is vXf32 with at least 4 elements. 
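// --- Editor's illustrative aside (not part of the patch) -------------------
// The combineFP_EXTEND hunk here widens small f16 vectors and converts them
// via X86ISD::(STRICT_)CVTPH2PS, the same operation exposed by the F16C
// intrinsic below (assumes a target built with -mf16c; helper name is made up):
#include <immintrin.h>
static inline __m128 demo_half4_to_float4(__m128i packed_halves) {
  // Low 64 bits of 'packed_halves' hold four IEEE half-precision values.
  return _mm_cvtph_ps(packed_halves); // vcvtph2ps
}
// ----------------------------------------------------------------------------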
- EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, - std::max(4U, NumElts)); + EVT CvtVT = + EVT::getVectorVT(*DAG.getContext(), MVT::f32, std::max(4U, NumElts)); SDValue Cvt, Chain; if (IsStrict) { Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other}, @@ -61542,7 +61664,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { - // clang-format off + // clang-format off default: break; case ISD::SCALAR_TO_VECTOR: return combineSCALAR_TO_VECTOR(N, DAG, Subtarget); @@ -61893,7 +62015,8 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { bool Commute = false; switch (Op.getOpcode()) { - default: return false; + default: + return false; case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: @@ -61933,8 +62056,7 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { ((Commute && !isa(N1)) || (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op)))) return false; - if (IsFoldableAtomicRMW(N0, Op) || - (Commute && IsFoldableAtomicRMW(N1, Op))) + if (IsFoldableAtomicRMW(N0, Op) || (Commute && IsFoldableAtomicRMW(N1, Op))) return false; } } @@ -62021,8 +62143,7 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const { default: break; } - } - else if (Constraint.size() == 2) { + } else if (Constraint.size() == 2) { switch (Constraint[0]) { default: break; @@ -62211,8 +62332,7 @@ X86TargetLowering::getSingleConstraintMatchWeight( /// Try to replace an X constraint, which matches anything, with another that /// has more specific requirements based on the type of the corresponding /// operand. -const char *X86TargetLowering:: -LowerXConstraint(EVT ConstraintVT) const { +const char *X86TargetLowering::LowerXConstraint(EVT ConstraintVT) const { // FP X constraints get lowered to SSE1/2 registers if available, otherwise // 'f' like normal targets. if (ConstraintVT.isFloatingPoint()) { @@ -62258,7 +62378,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, SDValue Result; char ConstraintLetter = Constraint[0]; switch (ConstraintLetter) { - default: break; + default: + break; case 'I': if (auto *C = dyn_cast(Op)) { if (C->getZExtValue() <= 31) { @@ -62332,8 +62453,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64); break; } - // FIXME gcc accepts some relocatable values here too, but only in certain - // memory models; it's complicated. + // FIXME gcc accepts some relocatable values here too, but only in certain + // memory models; it's complicated. } return; } @@ -62376,8 +62497,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, if (auto *CST = dyn_cast(Op)) { bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1; BooleanContent BCont = getBooleanContents(MVT::i64); - ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont) - : ISD::SIGN_EXTEND; + ISD::NodeType ExtOpc = + IsBool ? getExtendForContent(BCont) : ISD::SIGN_EXTEND; int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue() : CST->getSExtValue(); Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64); @@ -62456,7 +62577,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (Constraint.size() == 1) { // GCC Constraint Letters switch (Constraint[0]) { - default: break; + default: + break; // 'A' means [ER]AX + [ER]DX. 
case 'A': if (Subtarget.is64Bit()) @@ -62484,7 +62606,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &X86::VK64RegClass); } break; - case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. + case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. if (Subtarget.is64Bit()) { if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, useEGPRInlineAsm(Subtarget) @@ -62506,7 +62628,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, } [[fallthrough]]; // 32-bit fallthrough - case 'Q': // Q_REGS + case 'Q': // Q_REGS if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8_ABCD_LRegClass); if (VT == MVT::i16) @@ -62517,8 +62639,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (VT != MVT::f80 && !VT.isVector()) return std::make_pair(0U, &X86::GR64_ABCDRegClass); break; - case 'r': // GENERAL_REGS - case 'l': // INDEX_REGS + case 'r': // GENERAL_REGS + case 'l': // INDEX_REGS if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, useEGPRInlineAsm(Subtarget) ? &X86::GR8RegClass @@ -62537,7 +62659,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, ? &X86::GR64RegClass : &X86::GR64_NOREX2RegClass); break; - case 'R': // LEGACY_REGS + case 'R': // LEGACY_REGS if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8_NOREXRegClass); if (VT == MVT::i16) @@ -62548,7 +62670,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (VT != MVT::f80 && !VT.isVector()) return std::make_pair(0U, &X86::GR64_NOREXRegClass); break; - case 'f': // FP Stack registers. + case 'f': // FP Stack registers. // If SSE is enabled for this VT, use f80 to ensure the isel moves the // value to the correct fpstack register class. if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) @@ -62558,16 +62680,19 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) return std::make_pair(0U, &X86::RFP80RegClass); break; - case 'y': // MMX_REGS if MMX allowed. - if (!Subtarget.hasMMX()) break; + case 'y': // MMX_REGS if MMX allowed. + if (!Subtarget.hasMMX()) + break; return std::make_pair(0U, &X86::VR64RegClass); case 'v': - case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed - if (!Subtarget.hasSSE1()) break; + case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed + if (!Subtarget.hasSSE1()) + break; bool VConstraint = (Constraint[0] == 'v'); switch (VT.SimpleTy) { - default: break; + default: + break; // Scalar SSE types. case MVT::f16: if (VConstraint && Subtarget.hasFP16()) @@ -62658,7 +62783,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case MVT::v16f32: case MVT::v16i32: case MVT::v8i64: - if (!Subtarget.hasAVX512()) break; + if (!Subtarget.hasAVX512()) + break; if (VConstraint) return std::make_pair(0U, &X86::VR512RegClass); return std::make_pair(0U, &X86::VR512_0_15RegClass); @@ -62674,12 +62800,15 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case '2': return getRegForInlineAsmConstraint(TRI, "x", VT); case 'm': - if (!Subtarget.hasMMX()) break; + if (!Subtarget.hasMMX()) + break; return std::make_pair(0U, &X86::VR64RegClass); case 'z': - if (!Subtarget.hasSSE1()) break; + if (!Subtarget.hasSSE1()) + break; switch (VT.SimpleTy) { - default: break; + default: + break; // Scalar SSE types. 
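// --- Editor's illustrative aside (not part of the patch) -------------------
// getRegForInlineAsmConstraint() maps GCC inline-asm constraint letters to
// register classes; e.g. "A" names the EDX:EAX pair on 32-bit x86 and the
// "q"/"Q" family restricts a value to the byte-addressable a/b/c/d registers.
// A classic 32-bit-only user of "A" (sketch; the helper name is made up):
static inline unsigned long long demo_read_tsc_i386(void) {
  unsigned long long v;
  __asm__ volatile("rdtsc" : "=A"(v)); // "A": 64-bit result split across EDX:EAX
  return v;
}
// ----------------------------------------------------------------------------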
case MVT::f16: if (!Subtarget.hasFP16()) @@ -62794,14 +62923,15 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. - std::pair Res; + std::pair Res; Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); // Not found as a standard register? if (!Res.second) { // Only match x87 registers if the VT is one SelectionDAGBuilder can convert // to/from f80. - if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) { + if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || + VT == MVT::f80) { // Map st(0) -> st(7) -> ST0 if (Constraint.size() == 7 && Constraint[0] == '{' && tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' && @@ -62859,7 +62989,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // turn into {ax},{dx}. // MVT::Other is used to specify clobber names. if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other) - return Res; // Correct type already, nothing to do. + return Res; // Correct type already, nothing to do. // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should // return "eax". This should even work for things like getting 64bit integer @@ -62871,7 +63001,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // Therefore, use a helper method. if (isGRClass(*Class)) { unsigned Size = VT.getSizeInBits(); - if (Size == 1) Size = 8; + if (Size == 1) + Size = 8; if (Size != 8 && Size != 16 && Size != 32 && Size != 64) return std::make_pair(0, nullptr); Register DestReg = getX86SubSuperRegister(Res.first, Size); @@ -62879,9 +63010,11 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, bool is64Bit = Subtarget.is64Bit(); const TargetRegisterClass *RC = Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass) - : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass) - : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass) - : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr); + : Size == 16 + ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass) + : Size == 32 + ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass) + : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr); if (Size == 64 && !is64Bit) { // Model GCC's behavior here and select a fixed pair of 32-bit // registers. @@ -63133,8 +63266,7 @@ X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const { return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk"; } -unsigned -X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const { +unsigned X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const { // The default stack probe size is 4096 if the function has no stackprobesize // attribute. return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size", diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index d759895719388..df3838fab4ae9 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -18,1975 +18,1964 @@ #include "llvm/CodeGen/TargetLowering.h" namespace llvm { - class X86Subtarget; - class X86TargetMachine; - - namespace X86ISD { - // X86 Specific DAG Nodes - enum NodeType : unsigned { - // Start the numbering where the builtin ops leave off. 
- FIRST_NUMBER = ISD::BUILTIN_OP_END, - - /// Bit scan forward. - BSF, - /// Bit scan reverse. - BSR, - - /// X86 funnel/double shift i16 instructions. These correspond to - /// X86::SHLDW and X86::SHRDW instructions which have different amt - /// modulo rules to generic funnel shifts. - /// NOTE: The operand order matches ISD::FSHL/FSHR not SHLD/SHRD. - FSHL, - FSHR, - - /// Bitwise logical AND of floating point values. This corresponds - /// to X86::ANDPS or X86::ANDPD. - FAND, - - /// Bitwise logical OR of floating point values. This corresponds - /// to X86::ORPS or X86::ORPD. - FOR, - - /// Bitwise logical XOR of floating point values. This corresponds - /// to X86::XORPS or X86::XORPD. - FXOR, - - /// Bitwise logical ANDNOT of floating point values. This - /// corresponds to X86::ANDNPS or X86::ANDNPD. - FANDN, - - /// These operations represent an abstract X86 call - /// instruction, which includes a bunch of information. In particular the - /// operands of these node are: - /// - /// #0 - The incoming token chain - /// #1 - The callee - /// #2 - The number of arg bytes the caller pushes on the stack. - /// #3 - The number of arg bytes the callee pops off the stack. - /// #4 - The value to pass in AL/AX/EAX (optional) - /// #5 - The value to pass in DL/DX/EDX (optional) - /// - /// The result values of these nodes are: - /// - /// #0 - The outgoing token chain - /// #1 - The first register result value (optional) - /// #2 - The second register result value (optional) - /// - CALL, - - /// Same as call except it adds the NoTrack prefix. - NT_CALL, - - // Pseudo for a OBJC call that gets emitted together with a special - // marker instruction. - CALL_RVMARKER, - - /// The same as ISD::CopyFromReg except that this node makes it explicit - /// that it may lower to an x87 FPU stack pop. Optimizations should be more - /// cautious when handling this node than a normal CopyFromReg to avoid - /// removing a required FPU stack pop. A key requirement is optimizations - /// should not optimize any users of a chain that contains a - /// POP_FROM_X87_REG to use a chain from a point earlier than the - /// POP_FROM_X87_REG (which may remove a required FPU stack pop). - POP_FROM_X87_REG, - - // Pseudo for a call to an imported function to ensure the correct machine - // instruction is emitted for Import Call Optimization. - IMP_CALL, - - /// X86 compare and logical compare instructions. - CMP, - FCMP, - COMI, - UCOMI, - - // X86 compare with Intrinsics similar to COMI. - COMX, - UCOMX, - - /// X86 bit-test instructions. - BT, - - /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS - /// operand, usually produced by a CMP instruction. - SETCC, - - /// X86 Select - SELECTS, - - /// X86 Constant-time Select, implemented with CMOV instruction. This is - /// used to implement constant-time select. - CTSELECT, - - // Same as SETCC except it's materialized with a sbb and the value is all - // one's or all zero's. - SETCC_CARRY, // R = carry_bit ? ~0 : 0 - - /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD. - /// Operands are two FP values to compare; result is a mask of - /// 0s or 1s. Generally DTRT for C/C++ with NaNs. - FSETCC, - - /// X86 FP SETCC, similar to above, but with output as an i1 mask and - /// and a version with SAE. - FSETCCM, - FSETCCM_SAE, - - /// X86 conditional moves. Operand 0 and operand 1 are the two values - /// to select from. Operand 2 is the condition code, and operand 3 is the - /// flag operand produced by a CMP or TEST instruction. 
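// --- Editor's illustrative aside (not part of the patch) -------------------
// X86ISD::CTSELECT (introduced by this patch) lowers to CMOV when the target
// has it; per the patch description, i386 targets without CMOV instead get a
// branch-free bitwise fallback, conceptually the classic constant-time select
// below (sketch only; the helper name is made up):
static inline unsigned demo_ct_select_u32(bool cond, unsigned t, unsigned f) {
  unsigned mask = 0u - (unsigned)cond; // all-ones if cond is true, else zero
  return (t & mask) | (f & ~mask);     // no data-dependent branch or cmov
}
// ----------------------------------------------------------------------------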
- CMOV, - - /// X86 conditional branches. Operand 0 is the chain operand, operand 1 - /// is the block to branch if condition is true, operand 2 is the - /// condition code, and operand 3 is the flag operand produced by a CMP - /// or TEST instruction. - BRCOND, - - /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and - /// operand 1 is the target address. - NT_BRIND, - - /// Return with a glue operand. Operand 0 is the chain operand, operand - /// 1 is the number of bytes of stack to pop. - RET_GLUE, - - /// Return from interrupt. Operand 0 is the number of bytes to pop. - IRET, - - /// Repeat fill, corresponds to X86::REP_STOSx. - REP_STOS, - - /// Repeat move, corresponds to X86::REP_MOVSx. - REP_MOVS, - - /// On Darwin, this node represents the result of the popl - /// at function entry, used for PIC code. - GlobalBaseReg, - - /// A wrapper node for TargetConstantPool, TargetJumpTable, - /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress, - /// MCSymbol and TargetBlockAddress. - Wrapper, - - /// Special wrapper used under X86-64 PIC mode for RIP - /// relative displacements. - WrapperRIP, - - /// Copies a 64-bit value from an MMX vector to the low word - /// of an XMM vector, with the high word zero filled. - MOVQ2DQ, - - /// Copies a 64-bit value from the low word of an XMM vector - /// to an MMX vector. - MOVDQ2Q, - - /// Copies a 32-bit value from the low word of a MMX - /// vector to a GPR. - MMX_MOVD2W, - - /// Copies a GPR into the low 32-bit word of a MMX vector - /// and zero out the high word. - MMX_MOVW2D, - - /// Extract an 8-bit value from a vector and zero extend it to - /// i32, corresponds to X86::PEXTRB. - PEXTRB, - - /// Extract a 16-bit value from a vector and zero extend it to - /// i32, corresponds to X86::PEXTRW. - PEXTRW, - - /// Insert any element of a 4 x float vector into any element - /// of a destination 4 x floatvector. - INSERTPS, - - /// Insert the lower 8-bits of a 32-bit value to a vector, - /// corresponds to X86::PINSRB. - PINSRB, - - /// Insert the lower 16-bits of a 32-bit value to a vector, - /// corresponds to X86::PINSRW. - PINSRW, - - /// Shuffle 16 8-bit values within a vector. - PSHUFB, - - /// Compute Sum of Absolute Differences. - PSADBW, - /// Compute Double Block Packed Sum-Absolute-Differences - DBPSADBW, - - /// Bitwise Logical AND NOT of Packed FP values. - ANDNP, - - /// Blend where the selector is an immediate. - BLENDI, - - /// Dynamic (non-constant condition) vector blend where only the sign bits - /// of the condition elements are used. This is used to enforce that the - /// condition mask is not valid for generic VSELECT optimizations. This - /// is also used to implement the intrinsics. - /// Operands are in VSELECT order: MASK, TRUE, FALSE - BLENDV, - - /// Combined add and sub on an FP vector. - ADDSUB, - - // FP vector ops with rounding mode. - FADD_RND, - FADDS, - FADDS_RND, - FSUB_RND, - FSUBS, - FSUBS_RND, - FMUL_RND, - FMULS, - FMULS_RND, - FDIV_RND, - FDIVS, - FDIVS_RND, - FMAX_SAE, - FMAXS_SAE, - FMIN_SAE, - FMINS_SAE, - FSQRT_RND, - FSQRTS, - FSQRTS_RND, - - // FP vector get exponent. - FGETEXP, - FGETEXP_SAE, - FGETEXPS, - FGETEXPS_SAE, - // Extract Normalized Mantissas. - VGETMANT, - VGETMANT_SAE, - VGETMANTS, - VGETMANTS_SAE, - // FP Scale. - SCALEF, - SCALEF_RND, - SCALEFS, - SCALEFS_RND, - - /// Integer horizontal add/sub. - HADD, - HSUB, - - /// Floating point horizontal add/sub. 
- FHADD, - FHSUB, - - // Detect Conflicts Within a Vector - CONFLICT, - - /// Floating point max and min. - FMAX, - FMIN, - - /// Commutative FMIN and FMAX. - FMAXC, - FMINC, - - /// Scalar intrinsic floating point max and min. - FMAXS, - FMINS, - - /// Floating point reciprocal-sqrt and reciprocal approximation. - /// Note that these typically require refinement - /// in order to obtain suitable precision. - FRSQRT, - FRCP, - - // AVX-512 reciprocal approximations with a little more precision. - RSQRT14, - RSQRT14S, - RCP14, - RCP14S, - - // Thread Local Storage. - TLSADDR, - - // Thread Local Storage. A call to get the start address - // of the TLS block for the current module. - TLSBASEADDR, - - // Thread Local Storage. When calling to an OS provided - // thunk at the address from an earlier relocation. - TLSCALL, - - // Thread Local Storage. A descriptor containing pointer to - // code and to argument to get the TLS offset for the symbol. - TLSDESC, - - // Exception Handling helpers. - EH_RETURN, - - // SjLj exception handling setjmp. - EH_SJLJ_SETJMP, - - // SjLj exception handling longjmp. - EH_SJLJ_LONGJMP, - - // SjLj exception handling dispatch. - EH_SJLJ_SETUP_DISPATCH, - - /// Tail call return. See X86TargetLowering::LowerCall for - /// the list of operands. - TC_RETURN, - - // Vector move to low scalar and zero higher vector elements. - VZEXT_MOVL, - - // Vector integer truncate. - VTRUNC, - // Vector integer truncate with unsigned/signed saturation. - VTRUNCUS, - VTRUNCS, - - // Masked version of the above. Used when less than a 128-bit result is - // produced since the mask only applies to the lower elements and can't - // be represented by a select. - // SRC, PASSTHRU, MASK - VMTRUNC, - VMTRUNCUS, - VMTRUNCS, - - // Vector FP extend. - VFPEXT, - VFPEXT_SAE, - VFPEXTS, - VFPEXTS_SAE, - - // Vector FP round. - VFPROUND, - // Convert TWO packed single data to one packed data - VFPROUND2, - VFPROUND2_RND, - VFPROUND_RND, - VFPROUNDS, - VFPROUNDS_RND, - - // Masked version of above. Used for v2f64->v4f32. - // SRC, PASSTHRU, MASK - VMFPROUND, - - // 128-bit vector logical left / right shift - VSHLDQ, - VSRLDQ, - - // Vector shift elements - VSHL, - VSRL, - VSRA, - - // Vector variable shift - VSHLV, - VSRLV, - VSRAV, - - // Vector shift elements by immediate - VSHLI, - VSRLI, - VSRAI, - - // Shifts of mask registers. - KSHIFTL, - KSHIFTR, - - // Bit rotate by immediate - VROTLI, - VROTRI, - - // Vector packed double/float comparison. - CMPP, - - // Vector integer comparisons. - PCMPEQ, - PCMPGT, - - // v8i16 Horizontal minimum and position. - PHMINPOS, - - MULTISHIFT, - - /// Vector comparison generating mask bits for fp and - /// integer signed and unsigned data types. - CMPM, - // Vector mask comparison generating mask bits for FP values. - CMPMM, - // Vector mask comparison with SAE for FP values. - CMPMM_SAE, - - // Arithmetic operations with FLAGS results. - ADD, - SUB, - ADC, - SBB, - SMUL, - UMUL, - OR, - XOR, - AND, - - // Bit field extract. - BEXTR, - BEXTRI, - - // Zero High Bits Starting with Specified Bit Position. - BZHI, - - // Parallel extract and deposit. - PDEP, - PEXT, - - // X86-specific multiply by immediate. - MUL_IMM, - - // Vector sign bit extraction. - MOVMSK, - - // Vector bitwise comparisons. - PTEST, - - // Vector packed fp sign bitwise comparisons. - TESTP, - - // OR/AND test for masks. - KORTEST, - KTEST, - - // ADD for masks. - KADD, - - // Several flavors of instructions with vector shuffle behaviors. - // Saturated signed/unnsigned packing. 
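// --- Editor's illustrative aside (not part of the patch) -------------------
// PACKSS/PACKUS narrow each input element with signed/unsigned saturation;
// the SSE2 intrinsic view of the signed word->byte form (sketch only; the
// helper name is made up):
#include <emmintrin.h>
static inline __m128i demo_pack_words_to_bytes(__m128i a, __m128i b) {
  return _mm_packs_epi16(a, b); // packsswb: two v8i16 -> one v16i8, saturated
}
// ----------------------------------------------------------------------------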
- PACKSS, - PACKUS, - // Intra-lane alignr. - PALIGNR, - // AVX512 inter-lane alignr. - VALIGN, - PSHUFD, - PSHUFHW, - PSHUFLW, - SHUFP, - // VBMI2 Concat & Shift. - VSHLD, - VSHRD, - - // Shuffle Packed Values at 128-bit granularity. - SHUF128, - MOVDDUP, - MOVSHDUP, - MOVSLDUP, - MOVLHPS, - MOVHLPS, - MOVSD, - MOVSS, - MOVSH, - UNPCKL, - UNPCKH, - VPERMILPV, - VPERMILPI, - VPERMI, - VPERM2X128, - - // Variable Permute (VPERM). - // Res = VPERMV MaskV, V0 - VPERMV, - - // 3-op Variable Permute (VPERMT2). - // Res = VPERMV3 V0, MaskV, V1 - VPERMV3, - - // Bitwise ternary logic. - VPTERNLOG, - // Fix Up Special Packed Float32/64 values. - VFIXUPIMM, - VFIXUPIMM_SAE, - VFIXUPIMMS, - VFIXUPIMMS_SAE, - // Range Restriction Calculation For Packed Pairs of Float32/64 values. - VRANGE, - VRANGE_SAE, - VRANGES, - VRANGES_SAE, - // Reduce - Perform Reduction Transformation on scalar\packed FP. - VREDUCE, - VREDUCE_SAE, - VREDUCES, - VREDUCES_SAE, - // RndScale - Round FP Values To Include A Given Number Of Fraction Bits. - // Also used by the legacy (V)ROUND intrinsics where we mask out the - // scaling part of the immediate. - VRNDSCALE, - VRNDSCALE_SAE, - VRNDSCALES, - VRNDSCALES_SAE, - // Tests Types Of a FP Values for packed types. - VFPCLASS, - // Tests Types Of a FP Values for scalar types. - VFPCLASSS, - - // Broadcast (splat) scalar or element 0 of a vector. If the operand is - // a vector, this node may change the vector length as part of the splat. - VBROADCAST, - // Broadcast mask to vector. - VBROADCASTM, - - /// SSE4A Extraction and Insertion. - EXTRQI, - INSERTQI, - - // XOP arithmetic/logical shifts. - VPSHA, - VPSHL, - // XOP signed/unsigned integer comparisons. - VPCOM, - VPCOMU, - // XOP packed permute bytes. - VPPERM, - // XOP two source permutation. - VPERMIL2, - - // Vector multiply packed unsigned doubleword integers. - PMULUDQ, - // Vector multiply packed signed doubleword integers. - PMULDQ, - // Vector Multiply Packed UnsignedIntegers with Round and Scale. - MULHRS, - - // Multiply and Add Packed Integers. - VPMADDUBSW, - VPMADDWD, - - // AVX512IFMA multiply and add. - // NOTE: These are different than the instruction and perform - // op0 x op1 + op2. - VPMADD52L, - VPMADD52H, - - // VNNI - VPDPBUSD, - VPDPBUSDS, - VPDPWSSD, - VPDPWSSDS, - - // FMA nodes. - // We use the target independent ISD::FMA for the non-inverted case. - FNMADD, - FMSUB, - FNMSUB, - FMADDSUB, - FMSUBADD, - - // FMA with rounding mode. - FMADD_RND, - FNMADD_RND, - FMSUB_RND, - FNMSUB_RND, - FMADDSUB_RND, - FMSUBADD_RND, - - // AVX512-FP16 complex addition and multiplication. 
- VFMADDC, - VFMADDC_RND, - VFCMADDC, - VFCMADDC_RND, - - VFMULC, - VFMULC_RND, - VFCMULC, - VFCMULC_RND, - - VFMADDCSH, - VFMADDCSH_RND, - VFCMADDCSH, - VFCMADDCSH_RND, - - VFMULCSH, - VFMULCSH_RND, - VFCMULCSH, - VFCMULCSH_RND, - - VPDPBSUD, - VPDPBSUDS, - VPDPBUUD, - VPDPBUUDS, - VPDPBSSD, - VPDPBSSDS, - - VPDPWSUD, - VPDPWSUDS, - VPDPWUSD, - VPDPWUSDS, - VPDPWUUD, - VPDPWUUDS, - - VMINMAX, - VMINMAX_SAE, - VMINMAXS, - VMINMAXS_SAE, - - CVTP2IBS, - CVTP2IUBS, - CVTP2IBS_RND, - CVTP2IUBS_RND, - CVTTP2IBS, - CVTTP2IUBS, - CVTTP2IBS_SAE, - CVTTP2IUBS_SAE, - - MPSADBW, - - VCVT2PH2BF8, - VCVT2PH2BF8S, - VCVT2PH2HF8, - VCVT2PH2HF8S, - VCVTBIASPH2BF8, - VCVTBIASPH2BF8S, - VCVTBIASPH2HF8, - VCVTBIASPH2HF8S, - VCVTPH2BF8, - VCVTPH2BF8S, - VCVTPH2HF8, - VCVTPH2HF8S, - VMCVTBIASPH2BF8, - VMCVTBIASPH2BF8S, - VMCVTBIASPH2HF8, - VMCVTBIASPH2HF8S, - VMCVTPH2BF8, - VMCVTPH2BF8S, - VMCVTPH2HF8, - VMCVTPH2HF8S, - VCVTHF82PH, - - // Compress and expand. - COMPRESS, - EXPAND, - - // Bits shuffle - VPSHUFBITQMB, - - // Convert Unsigned/Integer to Floating-Point Value with rounding mode. - SINT_TO_FP_RND, - UINT_TO_FP_RND, - SCALAR_SINT_TO_FP, - SCALAR_UINT_TO_FP, - SCALAR_SINT_TO_FP_RND, - SCALAR_UINT_TO_FP_RND, - - // Vector float/double to signed/unsigned integer. - CVTP2SI, - CVTP2UI, - CVTP2SI_RND, - CVTP2UI_RND, - // Scalar float/double to signed/unsigned integer. - CVTS2SI, - CVTS2UI, - CVTS2SI_RND, - CVTS2UI_RND, - - // Vector float/double to signed/unsigned integer with truncation. - CVTTP2SI, - CVTTP2UI, - CVTTP2SI_SAE, - CVTTP2UI_SAE, - - // Saturation enabled Vector float/double to signed/unsigned - // integer with truncation. - CVTTP2SIS, - CVTTP2UIS, - CVTTP2SIS_SAE, - CVTTP2UIS_SAE, - // Masked versions of above. Used for v2f64 to v4i32. - // SRC, PASSTHRU, MASK - MCVTTP2SIS, - MCVTTP2UIS, - - // Scalar float/double to signed/unsigned integer with truncation. - CVTTS2SI, - CVTTS2UI, - CVTTS2SI_SAE, - CVTTS2UI_SAE, - - // Vector signed/unsigned integer to float/double. - CVTSI2P, - CVTUI2P, - - // Scalar float/double to signed/unsigned integer with saturation. - CVTTS2SIS, - CVTTS2UIS, - CVTTS2SIS_SAE, - CVTTS2UIS_SAE, - - // Masked versions of above. Used for v2f64->v4f32. - // SRC, PASSTHRU, MASK - MCVTP2SI, - MCVTP2UI, - MCVTTP2SI, - MCVTTP2UI, - MCVTSI2P, - MCVTUI2P, - - // Custom handling for FP_TO_xINT_SAT - FP_TO_SINT_SAT, - FP_TO_UINT_SAT, - - // Vector float to bfloat16. - // Convert packed single data to packed BF16 data - CVTNEPS2BF16, - // Masked version of above. - // SRC, PASSTHRU, MASK - MCVTNEPS2BF16, - - // Dot product of BF16/FP16 pairs to accumulated into - // packed single precision. - DPBF16PS, - DPFP16PS, - - // A stack checking function call. On Windows it's _chkstk call. - DYN_ALLOCA, - - // For allocating variable amounts of stack space when using - // segmented stacks. Check if the current stacklet has enough space, and - // falls back to heap allocation if not. - SEG_ALLOCA, - - // For allocating stack space when using stack clash protector. - // Allocation is performed by block, and each block is probed. - PROBED_ALLOCA, - - // Memory barriers. - MFENCE, - - // Get a random integer and indicate whether it is valid in CF. - RDRAND, - - // Get a NIST SP800-90B & C compliant random integer and - // indicate whether it is valid in CF. - RDSEED, - - // Protection keys - // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX. - // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is - // value for ECX. 
- RDPKRU, - WRPKRU, - - // SSE42 string comparisons. - // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG - // will emit one or two instructions based on which results are used. If - // flags and index/mask this allows us to use a single instruction since - // we won't have to pick and opcode for flags. Instead we can rely on the - // DAG to CSE everything and decide at isel. - PCMPISTR, - PCMPESTR, - - // Test if in transactional execution. - XTEST, - - // Conversions between float and half-float. - CVTPS2PH, - CVTPS2PH_SAE, - CVTPH2PS, - CVTPH2PS_SAE, - - // Masked version of above. - // SRC, RND, PASSTHRU, MASK - MCVTPS2PH, - MCVTPS2PH_SAE, - - // Galois Field Arithmetic Instructions - GF2P8AFFINEINVQB, - GF2P8AFFINEQB, - GF2P8MULB, - - // LWP insert record. - LWPINS, - - // User level wait - UMWAIT, - TPAUSE, - - // Enqueue Stores Instructions - ENQCMD, - ENQCMDS, - - // For avx512-vp2intersect - VP2INTERSECT, - - // User level interrupts - testui - TESTUI, - - // Perform an FP80 add after changing precision control in FPCW. - FP80_ADD, - - // Conditional compare instructions - CCMP, - CTEST, - - /// X86 strict FP compare instructions. - FIRST_STRICTFP_OPCODE, - STRICT_FCMP = FIRST_STRICTFP_OPCODE, - STRICT_FCMPS, - - // Vector packed double/float comparison. - STRICT_CMPP, - - /// Vector comparison generating mask bits for fp and - /// integer signed and unsigned data types. - STRICT_CMPM, - - // Vector float/double to signed/unsigned integer with truncation. - STRICT_CVTTP2SI, - STRICT_CVTTP2UI, - - // Vector FP extend. - STRICT_VFPEXT, - - // Vector FP round. - STRICT_VFPROUND, - - // RndScale - Round FP Values To Include A Given Number Of Fraction Bits. - // Also used by the legacy (V)ROUND intrinsics where we mask out the - // scaling part of the immediate. - STRICT_VRNDSCALE, - - // Vector signed/unsigned integer to float/double. - STRICT_CVTSI2P, - STRICT_CVTUI2P, - - // Strict FMA nodes. - STRICT_FNMADD, - STRICT_FMSUB, - STRICT_FNMSUB, - - // Conversions between float and half-float. - STRICT_CVTPS2PH, - STRICT_CVTPH2PS, - - // Perform an FP80 add after changing precision control in FPCW. - STRICT_FP80_ADD, - - /// Floating point max and min. - STRICT_FMAX, - STRICT_FMIN, - LAST_STRICTFP_OPCODE = STRICT_FMIN, - - // Compare and swap. - FIRST_MEMORY_OPCODE, - LCMPXCHG_DAG = FIRST_MEMORY_OPCODE, - LCMPXCHG8_DAG, - LCMPXCHG16_DAG, - LCMPXCHG16_SAVE_RBX_DAG, - - /// LOCK-prefixed arithmetic read-modify-write instructions. - /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS) - LADD, - LSUB, - LOR, - LXOR, - LAND, - LBTS, - LBTC, - LBTR, - LBTS_RM, - LBTC_RM, - LBTR_RM, - - /// RAO arithmetic instructions. - /// OUTCHAIN = AADD(INCHAIN, PTR, RHS) - AADD, - AOR, - AXOR, - AAND, - - // Load, scalar_to_vector, and zero extend. - VZEXT_LOAD, - - // extract_vector_elt, store. - VEXTRACT_STORE, - - // scalar broadcast from memory. - VBROADCAST_LOAD, - - // subvector broadcast from memory. - SUBV_BROADCAST_LOAD, - - // Store FP control word into i16 memory. - FNSTCW16m, - - // Load FP control word from i16 memory. - FLDCW16m, - - // Store x87 FPU environment into memory. - FNSTENVm, - - // Load x87 FPU environment from memory. - FLDENVm, - - /// This instruction implements FP_TO_SINT with the - /// integer destination in memory and a FP reg source. This corresponds - /// to the X86::FIST*m instructions and the rounding mode change stuff. It - /// has two inputs (token chain and address) and two outputs (int value - /// and token chain). 
Memory VT specifies the type to store to. - FP_TO_INT_IN_MEM, - - /// This instruction implements SINT_TO_FP with the - /// integer source in memory and FP reg result. This corresponds to the - /// X86::FILD*m instructions. It has two inputs (token chain and address) - /// and two outputs (FP value and token chain). The integer source type is - /// specified by the memory VT. - FILD, - - /// This instruction implements a fp->int store from FP stack - /// slots. This corresponds to the fist instruction. It takes a - /// chain operand, value to store, address, and glue. The memory VT - /// specifies the type to store as. - FIST, - - /// This instruction implements an extending load to FP stack slots. - /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain - /// operand, and ptr to load from. The memory VT specifies the type to - /// load from. - FLD, - - /// This instruction implements a truncating store from FP stack - /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a - /// chain operand, value to store, address, and glue. The memory VT - /// specifies the type to store as. - FST, - - /// These instructions grab the address of the next argument - /// from a va_list. (reads and modifies the va_list in memory) - VAARG_64, - VAARG_X32, - - // Vector truncating store with unsigned/signed saturation - VTRUNCSTOREUS, - VTRUNCSTORES, - // Vector truncating masked store with unsigned/signed saturation - VMTRUNCSTOREUS, - VMTRUNCSTORES, - - // X86 specific gather and scatter - MGATHER, - MSCATTER, - - // Key locker nodes that produce flags. - AESENC128KL, - AESDEC128KL, - AESENC256KL, - AESDEC256KL, - AESENCWIDE128KL, - AESDECWIDE128KL, - AESENCWIDE256KL, - AESDECWIDE256KL, - - /// Compare and Add if Condition is Met. Compare value in operand 2 with - /// value in memory of operand 1. If condition of operand 4 is met, add - /// value operand 3 to m32 and write new value in operand 1. Operand 2 is - /// always updated with the original value from operand 1. - CMPCCXADD, - - // Save xmm argument registers to the stack, according to %al. An operator - // is needed so that this can be expanded with control flow. - VASTART_SAVE_XMM_REGS, - - // Conditional load/store instructions - CLOAD, - CSTORE, - LAST_MEMORY_OPCODE = CSTORE, - }; - } // end namespace X86ISD - - namespace X86 { - /// Current rounding mode is represented in bits 11:10 of FPSR. These - /// values are same as corresponding constants for rounding mode used - /// in glibc. - enum RoundingMode { - rmInvalid = -1, // For handle Invalid rounding mode - rmToNearest = 0, // FE_TONEAREST - rmDownward = 1 << 10, // FE_DOWNWARD - rmUpward = 2 << 10, // FE_UPWARD - rmTowardZero = 3 << 10, // FE_TOWARDZERO - rmMask = 3 << 10 // Bit mask selecting rounding mode - }; +class X86Subtarget; +class X86TargetMachine; + +namespace X86ISD { +// X86 Specific DAG Nodes +enum NodeType : unsigned { + // Start the numbering where the builtin ops leave off. + FIRST_NUMBER = ISD::BUILTIN_OP_END, + + /// Bit scan forward. + BSF, + /// Bit scan reverse. + BSR, + + /// X86 funnel/double shift i16 instructions. These correspond to + /// X86::SHLDW and X86::SHRDW instructions which have different amt + /// modulo rules to generic funnel shifts. + /// NOTE: The operand order matches ISD::FSHL/FSHR not SHLD/SHRD. + FSHL, + FSHR, + + /// Bitwise logical AND of floating point values. This corresponds + /// to X86::ANDPS or X86::ANDPD. + FAND, + + /// Bitwise logical OR of floating point values. 
This corresponds + /// to X86::ORPS or X86::ORPD. + FOR, + + /// Bitwise logical XOR of floating point values. This corresponds + /// to X86::XORPS or X86::XORPD. + FXOR, + + /// Bitwise logical ANDNOT of floating point values. This + /// corresponds to X86::ANDNPS or X86::ANDNPD. + FANDN, + + /// These operations represent an abstract X86 call + /// instruction, which includes a bunch of information. In particular the + /// operands of these node are: + /// + /// #0 - The incoming token chain + /// #1 - The callee + /// #2 - The number of arg bytes the caller pushes on the stack. + /// #3 - The number of arg bytes the callee pops off the stack. + /// #4 - The value to pass in AL/AX/EAX (optional) + /// #5 - The value to pass in DL/DX/EDX (optional) + /// + /// The result values of these nodes are: + /// + /// #0 - The outgoing token chain + /// #1 - The first register result value (optional) + /// #2 - The second register result value (optional) + /// + CALL, + + /// Same as call except it adds the NoTrack prefix. + NT_CALL, + + // Pseudo for a OBJC call that gets emitted together with a special + // marker instruction. + CALL_RVMARKER, + + /// The same as ISD::CopyFromReg except that this node makes it explicit + /// that it may lower to an x87 FPU stack pop. Optimizations should be more + /// cautious when handling this node than a normal CopyFromReg to avoid + /// removing a required FPU stack pop. A key requirement is optimizations + /// should not optimize any users of a chain that contains a + /// POP_FROM_X87_REG to use a chain from a point earlier than the + /// POP_FROM_X87_REG (which may remove a required FPU stack pop). + POP_FROM_X87_REG, + + // Pseudo for a call to an imported function to ensure the correct machine + // instruction is emitted for Import Call Optimization. + IMP_CALL, + + /// X86 compare and logical compare instructions. + CMP, + FCMP, + COMI, + UCOMI, + + // X86 compare with Intrinsics similar to COMI. + COMX, + UCOMX, + + /// X86 bit-test instructions. + BT, + + /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS + /// operand, usually produced by a CMP instruction. + SETCC, + + /// X86 Select + SELECTS, + + /// X86 Constant-time Select, implemented with CMOV instruction. This is + /// used to implement constant-time select. + CTSELECT, + + // Same as SETCC except it's materialized with a sbb and the value is all + // one's or all zero's. + SETCC_CARRY, // R = carry_bit ? ~0 : 0 + + /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD. + /// Operands are two FP values to compare; result is a mask of + /// 0s or 1s. Generally DTRT for C/C++ with NaNs. + FSETCC, + + /// X86 FP SETCC, similar to above, but with output as an i1 mask and + /// and a version with SAE. + FSETCCM, + FSETCCM_SAE, + + /// X86 conditional moves. Operand 0 and operand 1 are the two values + /// to select from. Operand 2 is the condition code, and operand 3 is the + /// flag operand produced by a CMP or TEST instruction. + CMOV, + + /// X86 conditional branches. Operand 0 is the chain operand, operand 1 + /// is the block to branch if condition is true, operand 2 is the + /// condition code, and operand 3 is the flag operand produced by a CMP + /// or TEST instruction. + BRCOND, + + /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and + /// operand 1 is the target address. + NT_BRIND, + + /// Return with a glue operand. Operand 0 is the chain operand, operand + /// 1 is the number of bytes of stack to pop. + RET_GLUE, + + /// Return from interrupt. 
Operand 0 is the number of bytes to pop. + IRET, + + /// Repeat fill, corresponds to X86::REP_STOSx. + REP_STOS, + + /// Repeat move, corresponds to X86::REP_MOVSx. + REP_MOVS, + + /// On Darwin, this node represents the result of the popl + /// at function entry, used for PIC code. + GlobalBaseReg, + + /// A wrapper node for TargetConstantPool, TargetJumpTable, + /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress, + /// MCSymbol and TargetBlockAddress. + Wrapper, + + /// Special wrapper used under X86-64 PIC mode for RIP + /// relative displacements. + WrapperRIP, + + /// Copies a 64-bit value from an MMX vector to the low word + /// of an XMM vector, with the high word zero filled. + MOVQ2DQ, + + /// Copies a 64-bit value from the low word of an XMM vector + /// to an MMX vector. + MOVDQ2Q, + + /// Copies a 32-bit value from the low word of a MMX + /// vector to a GPR. + MMX_MOVD2W, + + /// Copies a GPR into the low 32-bit word of a MMX vector + /// and zero out the high word. + MMX_MOVW2D, + + /// Extract an 8-bit value from a vector and zero extend it to + /// i32, corresponds to X86::PEXTRB. + PEXTRB, + + /// Extract a 16-bit value from a vector and zero extend it to + /// i32, corresponds to X86::PEXTRW. + PEXTRW, + + /// Insert any element of a 4 x float vector into any element + /// of a destination 4 x floatvector. + INSERTPS, + + /// Insert the lower 8-bits of a 32-bit value to a vector, + /// corresponds to X86::PINSRB. + PINSRB, + + /// Insert the lower 16-bits of a 32-bit value to a vector, + /// corresponds to X86::PINSRW. + PINSRW, + + /// Shuffle 16 8-bit values within a vector. + PSHUFB, + + /// Compute Sum of Absolute Differences. + PSADBW, + /// Compute Double Block Packed Sum-Absolute-Differences + DBPSADBW, + + /// Bitwise Logical AND NOT of Packed FP values. + ANDNP, + + /// Blend where the selector is an immediate. + BLENDI, + + /// Dynamic (non-constant condition) vector blend where only the sign bits + /// of the condition elements are used. This is used to enforce that the + /// condition mask is not valid for generic VSELECT optimizations. This + /// is also used to implement the intrinsics. + /// Operands are in VSELECT order: MASK, TRUE, FALSE + BLENDV, + + /// Combined add and sub on an FP vector. + ADDSUB, + + // FP vector ops with rounding mode. + FADD_RND, + FADDS, + FADDS_RND, + FSUB_RND, + FSUBS, + FSUBS_RND, + FMUL_RND, + FMULS, + FMULS_RND, + FDIV_RND, + FDIVS, + FDIVS_RND, + FMAX_SAE, + FMAXS_SAE, + FMIN_SAE, + FMINS_SAE, + FSQRT_RND, + FSQRTS, + FSQRTS_RND, + + // FP vector get exponent. + FGETEXP, + FGETEXP_SAE, + FGETEXPS, + FGETEXPS_SAE, + // Extract Normalized Mantissas. + VGETMANT, + VGETMANT_SAE, + VGETMANTS, + VGETMANTS_SAE, + // FP Scale. + SCALEF, + SCALEF_RND, + SCALEFS, + SCALEFS_RND, + + /// Integer horizontal add/sub. + HADD, + HSUB, + + /// Floating point horizontal add/sub. + FHADD, + FHSUB, + + // Detect Conflicts Within a Vector + CONFLICT, + + /// Floating point max and min. + FMAX, + FMIN, + + /// Commutative FMIN and FMAX. + FMAXC, + FMINC, + + /// Scalar intrinsic floating point max and min. + FMAXS, + FMINS, + + /// Floating point reciprocal-sqrt and reciprocal approximation. + /// Note that these typically require refinement + /// in order to obtain suitable precision. + FRSQRT, + FRCP, + + // AVX-512 reciprocal approximations with a little more precision. + RSQRT14, + RSQRT14S, + RCP14, + RCP14S, + + // Thread Local Storage. + TLSADDR, + + // Thread Local Storage. 
A call to get the start address + // of the TLS block for the current module. + TLSBASEADDR, + + // Thread Local Storage. When calling to an OS provided + // thunk at the address from an earlier relocation. + TLSCALL, + + // Thread Local Storage. A descriptor containing pointer to + // code and to argument to get the TLS offset for the symbol. + TLSDESC, + + // Exception Handling helpers. + EH_RETURN, + + // SjLj exception handling setjmp. + EH_SJLJ_SETJMP, + + // SjLj exception handling longjmp. + EH_SJLJ_LONGJMP, + + // SjLj exception handling dispatch. + EH_SJLJ_SETUP_DISPATCH, + + /// Tail call return. See X86TargetLowering::LowerCall for + /// the list of operands. + TC_RETURN, + + // Vector move to low scalar and zero higher vector elements. + VZEXT_MOVL, + + // Vector integer truncate. + VTRUNC, + // Vector integer truncate with unsigned/signed saturation. + VTRUNCUS, + VTRUNCS, + + // Masked version of the above. Used when less than a 128-bit result is + // produced since the mask only applies to the lower elements and can't + // be represented by a select. + // SRC, PASSTHRU, MASK + VMTRUNC, + VMTRUNCUS, + VMTRUNCS, + + // Vector FP extend. + VFPEXT, + VFPEXT_SAE, + VFPEXTS, + VFPEXTS_SAE, + + // Vector FP round. + VFPROUND, + // Convert TWO packed single data to one packed data + VFPROUND2, + VFPROUND2_RND, + VFPROUND_RND, + VFPROUNDS, + VFPROUNDS_RND, + + // Masked version of above. Used for v2f64->v4f32. + // SRC, PASSTHRU, MASK + VMFPROUND, + + // 128-bit vector logical left / right shift + VSHLDQ, + VSRLDQ, + + // Vector shift elements + VSHL, + VSRL, + VSRA, + + // Vector variable shift + VSHLV, + VSRLV, + VSRAV, + + // Vector shift elements by immediate + VSHLI, + VSRLI, + VSRAI, + + // Shifts of mask registers. + KSHIFTL, + KSHIFTR, + + // Bit rotate by immediate + VROTLI, + VROTRI, + + // Vector packed double/float comparison. + CMPP, + + // Vector integer comparisons. + PCMPEQ, + PCMPGT, + + // v8i16 Horizontal minimum and position. + PHMINPOS, + + MULTISHIFT, + + /// Vector comparison generating mask bits for fp and + /// integer signed and unsigned data types. + CMPM, + // Vector mask comparison generating mask bits for FP values. + CMPMM, + // Vector mask comparison with SAE for FP values. + CMPMM_SAE, + + // Arithmetic operations with FLAGS results. + ADD, + SUB, + ADC, + SBB, + SMUL, + UMUL, + OR, + XOR, + AND, + + // Bit field extract. + BEXTR, + BEXTRI, + + // Zero High Bits Starting with Specified Bit Position. + BZHI, + + // Parallel extract and deposit. + PDEP, + PEXT, + + // X86-specific multiply by immediate. + MUL_IMM, + + // Vector sign bit extraction. + MOVMSK, + + // Vector bitwise comparisons. + PTEST, + + // Vector packed fp sign bitwise comparisons. + TESTP, + + // OR/AND test for masks. + KORTEST, + KTEST, + + // ADD for masks. + KADD, + + // Several flavors of instructions with vector shuffle behaviors. + // Saturated signed/unnsigned packing. + PACKSS, + PACKUS, + // Intra-lane alignr. + PALIGNR, + // AVX512 inter-lane alignr. + VALIGN, + PSHUFD, + PSHUFHW, + PSHUFLW, + SHUFP, + // VBMI2 Concat & Shift. + VSHLD, + VSHRD, + + // Shuffle Packed Values at 128-bit granularity. + SHUF128, + MOVDDUP, + MOVSHDUP, + MOVSLDUP, + MOVLHPS, + MOVHLPS, + MOVSD, + MOVSS, + MOVSH, + UNPCKL, + UNPCKH, + VPERMILPV, + VPERMILPI, + VPERMI, + VPERM2X128, + + // Variable Permute (VPERM). + // Res = VPERMV MaskV, V0 + VPERMV, + + // 3-op Variable Permute (VPERMT2). + // Res = VPERMV3 V0, MaskV, V1 + VPERMV3, + + // Bitwise ternary logic. 
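// --- Editor's illustrative aside (not part of the patch) -------------------
// VPTERNLOG evaluates an arbitrary three-input boolean function chosen by an
// 8-bit truth-table immediate; e.g. 0xE8 is bitwise majority (sketch only,
// requires AVX-512F; the helper name is made up):
#include <immintrin.h>
static inline __m512i demo_majority3(__m512i a, __m512i b, __m512i c) {
  return _mm512_ternarylogic_epi32(a, b, c, 0xE8); // maj(a,b,c)
}
// ----------------------------------------------------------------------------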
+ VPTERNLOG, + // Fix Up Special Packed Float32/64 values. + VFIXUPIMM, + VFIXUPIMM_SAE, + VFIXUPIMMS, + VFIXUPIMMS_SAE, + // Range Restriction Calculation For Packed Pairs of Float32/64 values. + VRANGE, + VRANGE_SAE, + VRANGES, + VRANGES_SAE, + // Reduce - Perform Reduction Transformation on scalar\packed FP. + VREDUCE, + VREDUCE_SAE, + VREDUCES, + VREDUCES_SAE, + // RndScale - Round FP Values To Include A Given Number Of Fraction Bits. + // Also used by the legacy (V)ROUND intrinsics where we mask out the + // scaling part of the immediate. + VRNDSCALE, + VRNDSCALE_SAE, + VRNDSCALES, + VRNDSCALES_SAE, + // Tests Types Of a FP Values for packed types. + VFPCLASS, + // Tests Types Of a FP Values for scalar types. + VFPCLASSS, + + // Broadcast (splat) scalar or element 0 of a vector. If the operand is + // a vector, this node may change the vector length as part of the splat. + VBROADCAST, + // Broadcast mask to vector. + VBROADCASTM, + + /// SSE4A Extraction and Insertion. + EXTRQI, + INSERTQI, + + // XOP arithmetic/logical shifts. + VPSHA, + VPSHL, + // XOP signed/unsigned integer comparisons. + VPCOM, + VPCOMU, + // XOP packed permute bytes. + VPPERM, + // XOP two source permutation. + VPERMIL2, + + // Vector multiply packed unsigned doubleword integers. + PMULUDQ, + // Vector multiply packed signed doubleword integers. + PMULDQ, + // Vector Multiply Packed UnsignedIntegers with Round and Scale. + MULHRS, + + // Multiply and Add Packed Integers. + VPMADDUBSW, + VPMADDWD, + + // AVX512IFMA multiply and add. + // NOTE: These are different than the instruction and perform + // op0 x op1 + op2. + VPMADD52L, + VPMADD52H, + + // VNNI + VPDPBUSD, + VPDPBUSDS, + VPDPWSSD, + VPDPWSSDS, + + // FMA nodes. + // We use the target independent ISD::FMA for the non-inverted case. + FNMADD, + FMSUB, + FNMSUB, + FMADDSUB, + FMSUBADD, + + // FMA with rounding mode. + FMADD_RND, + FNMADD_RND, + FMSUB_RND, + FNMSUB_RND, + FMADDSUB_RND, + FMSUBADD_RND, + + // AVX512-FP16 complex addition and multiplication. + VFMADDC, + VFMADDC_RND, + VFCMADDC, + VFCMADDC_RND, + + VFMULC, + VFMULC_RND, + VFCMULC, + VFCMULC_RND, + + VFMADDCSH, + VFMADDCSH_RND, + VFCMADDCSH, + VFCMADDCSH_RND, + + VFMULCSH, + VFMULCSH_RND, + VFCMULCSH, + VFCMULCSH_RND, + + VPDPBSUD, + VPDPBSUDS, + VPDPBUUD, + VPDPBUUDS, + VPDPBSSD, + VPDPBSSDS, + + VPDPWSUD, + VPDPWSUDS, + VPDPWUSD, + VPDPWUSDS, + VPDPWUUD, + VPDPWUUDS, + + VMINMAX, + VMINMAX_SAE, + VMINMAXS, + VMINMAXS_SAE, + + CVTP2IBS, + CVTP2IUBS, + CVTP2IBS_RND, + CVTP2IUBS_RND, + CVTTP2IBS, + CVTTP2IUBS, + CVTTP2IBS_SAE, + CVTTP2IUBS_SAE, + + MPSADBW, + + VCVT2PH2BF8, + VCVT2PH2BF8S, + VCVT2PH2HF8, + VCVT2PH2HF8S, + VCVTBIASPH2BF8, + VCVTBIASPH2BF8S, + VCVTBIASPH2HF8, + VCVTBIASPH2HF8S, + VCVTPH2BF8, + VCVTPH2BF8S, + VCVTPH2HF8, + VCVTPH2HF8S, + VMCVTBIASPH2BF8, + VMCVTBIASPH2BF8S, + VMCVTBIASPH2HF8, + VMCVTBIASPH2HF8S, + VMCVTPH2BF8, + VMCVTPH2BF8S, + VMCVTPH2HF8, + VMCVTPH2HF8S, + VCVTHF82PH, + + // Compress and expand. + COMPRESS, + EXPAND, + + // Bits shuffle + VPSHUFBITQMB, + + // Convert Unsigned/Integer to Floating-Point Value with rounding mode. + SINT_TO_FP_RND, + UINT_TO_FP_RND, + SCALAR_SINT_TO_FP, + SCALAR_UINT_TO_FP, + SCALAR_SINT_TO_FP_RND, + SCALAR_UINT_TO_FP_RND, + + // Vector float/double to signed/unsigned integer. + CVTP2SI, + CVTP2UI, + CVTP2SI_RND, + CVTP2UI_RND, + // Scalar float/double to signed/unsigned integer. + CVTS2SI, + CVTS2UI, + CVTS2SI_RND, + CVTS2UI_RND, + + // Vector float/double to signed/unsigned integer with truncation. 
+ CVTTP2SI, + CVTTP2UI, + CVTTP2SI_SAE, + CVTTP2UI_SAE, + + // Saturation enabled Vector float/double to signed/unsigned + // integer with truncation. + CVTTP2SIS, + CVTTP2UIS, + CVTTP2SIS_SAE, + CVTTP2UIS_SAE, + // Masked versions of above. Used for v2f64 to v4i32. + // SRC, PASSTHRU, MASK + MCVTTP2SIS, + MCVTTP2UIS, + + // Scalar float/double to signed/unsigned integer with truncation. + CVTTS2SI, + CVTTS2UI, + CVTTS2SI_SAE, + CVTTS2UI_SAE, + + // Vector signed/unsigned integer to float/double. + CVTSI2P, + CVTUI2P, + + // Scalar float/double to signed/unsigned integer with saturation. + CVTTS2SIS, + CVTTS2UIS, + CVTTS2SIS_SAE, + CVTTS2UIS_SAE, + + // Masked versions of above. Used for v2f64->v4f32. + // SRC, PASSTHRU, MASK + MCVTP2SI, + MCVTP2UI, + MCVTTP2SI, + MCVTTP2UI, + MCVTSI2P, + MCVTUI2P, + + // Custom handling for FP_TO_xINT_SAT + FP_TO_SINT_SAT, + FP_TO_UINT_SAT, + + // Vector float to bfloat16. + // Convert packed single data to packed BF16 data + CVTNEPS2BF16, + // Masked version of above. + // SRC, PASSTHRU, MASK + MCVTNEPS2BF16, + + // Dot product of BF16/FP16 pairs to accumulated into + // packed single precision. + DPBF16PS, + DPFP16PS, + + // A stack checking function call. On Windows it's _chkstk call. + DYN_ALLOCA, + + // For allocating variable amounts of stack space when using + // segmented stacks. Check if the current stacklet has enough space, and + // falls back to heap allocation if not. + SEG_ALLOCA, + + // For allocating stack space when using stack clash protector. + // Allocation is performed by block, and each block is probed. + PROBED_ALLOCA, + + // Memory barriers. + MFENCE, + + // Get a random integer and indicate whether it is valid in CF. + RDRAND, + + // Get a NIST SP800-90B & C compliant random integer and + // indicate whether it is valid in CF. + RDSEED, + + // Protection keys + // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX. + // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is + // value for ECX. + RDPKRU, + WRPKRU, + + // SSE42 string comparisons. + // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG + // will emit one or two instructions based on which results are used. If + // flags and index/mask this allows us to use a single instruction since + // we won't have to pick and opcode for flags. Instead we can rely on the + // DAG to CSE everything and decide at isel. + PCMPISTR, + PCMPESTR, + + // Test if in transactional execution. + XTEST, + + // Conversions between float and half-float. + CVTPS2PH, + CVTPS2PH_SAE, + CVTPH2PS, + CVTPH2PS_SAE, + + // Masked version of above. + // SRC, RND, PASSTHRU, MASK + MCVTPS2PH, + MCVTPS2PH_SAE, + + // Galois Field Arithmetic Instructions + GF2P8AFFINEINVQB, + GF2P8AFFINEQB, + GF2P8MULB, + + // LWP insert record. + LWPINS, + + // User level wait + UMWAIT, + TPAUSE, + + // Enqueue Stores Instructions + ENQCMD, + ENQCMDS, + + // For avx512-vp2intersect + VP2INTERSECT, + + // User level interrupts - testui + TESTUI, + + // Perform an FP80 add after changing precision control in FPCW. + FP80_ADD, + + // Conditional compare instructions + CCMP, + CTEST, + + /// X86 strict FP compare instructions. + FIRST_STRICTFP_OPCODE, + STRICT_FCMP = FIRST_STRICTFP_OPCODE, + STRICT_FCMPS, + + // Vector packed double/float comparison. + STRICT_CMPP, + + /// Vector comparison generating mask bits for fp and + /// integer signed and unsigned data types. + STRICT_CMPM, + + // Vector float/double to signed/unsigned integer with truncation. 
+ STRICT_CVTTP2SI, + STRICT_CVTTP2UI, + + // Vector FP extend. + STRICT_VFPEXT, + + // Vector FP round. + STRICT_VFPROUND, + + // RndScale - Round FP Values To Include A Given Number Of Fraction Bits. + // Also used by the legacy (V)ROUND intrinsics where we mask out the + // scaling part of the immediate. + STRICT_VRNDSCALE, + + // Vector signed/unsigned integer to float/double. + STRICT_CVTSI2P, + STRICT_CVTUI2P, + + // Strict FMA nodes. + STRICT_FNMADD, + STRICT_FMSUB, + STRICT_FNMSUB, + + // Conversions between float and half-float. + STRICT_CVTPS2PH, + STRICT_CVTPH2PS, + + // Perform an FP80 add after changing precision control in FPCW. + STRICT_FP80_ADD, + + /// Floating point max and min. + STRICT_FMAX, + STRICT_FMIN, + LAST_STRICTFP_OPCODE = STRICT_FMIN, + + // Compare and swap. + FIRST_MEMORY_OPCODE, + LCMPXCHG_DAG = FIRST_MEMORY_OPCODE, + LCMPXCHG8_DAG, + LCMPXCHG16_DAG, + LCMPXCHG16_SAVE_RBX_DAG, + + /// LOCK-prefixed arithmetic read-modify-write instructions. + /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS) + LADD, + LSUB, + LOR, + LXOR, + LAND, + LBTS, + LBTC, + LBTR, + LBTS_RM, + LBTC_RM, + LBTR_RM, + + /// RAO arithmetic instructions. + /// OUTCHAIN = AADD(INCHAIN, PTR, RHS) + AADD, + AOR, + AXOR, + AAND, + + // Load, scalar_to_vector, and zero extend. + VZEXT_LOAD, + + // extract_vector_elt, store. + VEXTRACT_STORE, + + // scalar broadcast from memory. + VBROADCAST_LOAD, + + // subvector broadcast from memory. + SUBV_BROADCAST_LOAD, + + // Store FP control word into i16 memory. + FNSTCW16m, + + // Load FP control word from i16 memory. + FLDCW16m, + + // Store x87 FPU environment into memory. + FNSTENVm, + + // Load x87 FPU environment from memory. + FLDENVm, + + /// This instruction implements FP_TO_SINT with the + /// integer destination in memory and a FP reg source. This corresponds + /// to the X86::FIST*m instructions and the rounding mode change stuff. It + /// has two inputs (token chain and address) and two outputs (int value + /// and token chain). Memory VT specifies the type to store to. + FP_TO_INT_IN_MEM, + + /// This instruction implements SINT_TO_FP with the + /// integer source in memory and FP reg result. This corresponds to the + /// X86::FILD*m instructions. It has two inputs (token chain and address) + /// and two outputs (FP value and token chain). The integer source type is + /// specified by the memory VT. + FILD, + + /// This instruction implements a fp->int store from FP stack + /// slots. This corresponds to the fist instruction. It takes a + /// chain operand, value to store, address, and glue. The memory VT + /// specifies the type to store as. + FIST, + + /// This instruction implements an extending load to FP stack slots. + /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain + /// operand, and ptr to load from. The memory VT specifies the type to + /// load from. + FLD, + + /// This instruction implements a truncating store from FP stack + /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a + /// chain operand, value to store, address, and glue. The memory VT + /// specifies the type to store as. + FST, + + /// These instructions grab the address of the next argument + /// from a va_list. 
(reads and modifies the va_list in memory) + VAARG_64, + VAARG_X32, + + // Vector truncating store with unsigned/signed saturation + VTRUNCSTOREUS, + VTRUNCSTORES, + // Vector truncating masked store with unsigned/signed saturation + VMTRUNCSTOREUS, + VMTRUNCSTORES, + + // X86 specific gather and scatter + MGATHER, + MSCATTER, + + // Key locker nodes that produce flags. + AESENC128KL, + AESDEC128KL, + AESENC256KL, + AESDEC256KL, + AESENCWIDE128KL, + AESDECWIDE128KL, + AESENCWIDE256KL, + AESDECWIDE256KL, + + /// Compare and Add if Condition is Met. Compare value in operand 2 with + /// value in memory of operand 1. If condition of operand 4 is met, add + /// value operand 3 to m32 and write new value in operand 1. Operand 2 is + /// always updated with the original value from operand 1. + CMPCCXADD, + + // Save xmm argument registers to the stack, according to %al. An operator + // is needed so that this can be expanded with control flow. + VASTART_SAVE_XMM_REGS, + + // Conditional load/store instructions + CLOAD, + CSTORE, + LAST_MEMORY_OPCODE = CSTORE, +}; +} // end namespace X86ISD + +namespace X86 { +/// Current rounding mode is represented in bits 11:10 of FPSR. These +/// values are same as corresponding constants for rounding mode used +/// in glibc. +enum RoundingMode { + rmInvalid = -1, // For handle Invalid rounding mode + rmToNearest = 0, // FE_TONEAREST + rmDownward = 1 << 10, // FE_DOWNWARD + rmUpward = 2 << 10, // FE_UPWARD + rmTowardZero = 3 << 10, // FE_TOWARDZERO + rmMask = 3 << 10 // Bit mask selecting rounding mode +}; +} // namespace X86 + +/// Define some predicates that are used for node matching. +namespace X86 { +/// Returns true if Elt is a constant zero or floating point constant +0.0. +bool isZeroNode(SDValue Elt); + +/// Returns true of the given offset can be +/// fit into displacement field of the instruction. +bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, + bool hasSymbolicDisplacement); + +/// Determines whether the callee is required to pop its +/// own arguments. Callee pop is necessary to support tail calls. +bool isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg, + bool GuaranteeTCO); + +/// If Op is a constant whose elements are all the same constant or +/// undefined, return true and return the constant value in \p SplatVal. +/// If we have undef bits that don't cover an entire element, we treat these +/// as zero if AllowPartialUndefs is set, else we fail and return false. +bool isConstantSplat(SDValue Op, APInt &SplatVal, + bool AllowPartialUndefs = true); + +/// Check if Op is a load operation that could be folded into some other x86 +/// instruction as a memory operand. Example: vpaddd (%rdi), %xmm0, %xmm0. +bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, + bool AssumeSingleUse = false); + +/// Check if Op is a load operation that could be folded into a vector splat +/// instruction as a memory operand. Example: vbroadcastss 16(%rdi), %xmm2. +bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, + const X86Subtarget &Subtarget, + bool AssumeSingleUse = false); + +/// Check if Op is a value that could be used to fold a store into some +/// other x86 instruction as a memory operand. Ex: pextrb $0, %xmm0, (%rdi). +bool mayFoldIntoStore(SDValue Op); + +/// Check if Op is an operation that could be folded into a zero extend x86 +/// instruction. +bool mayFoldIntoZeroExtend(SDValue Op); + +/// True if the target supports the extended frame for async Swift +/// functions. 
+bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, + const MachineFunction &MF); + +/// Convert LLVM rounding mode to X86 rounding mode. +int getRoundingModeX86(unsigned RM); + +} // end namespace X86 + +//===--------------------------------------------------------------------===// +// X86 Implementation of the TargetLowering interface +class X86TargetLowering final : public TargetLowering { +public: + explicit X86TargetLowering(const X86TargetMachine &TM, + const X86Subtarget &STI); + + unsigned getJumpTableEncoding() const override; + bool useSoftFloat() const override; + + void markLibCallAttributes(MachineFunction *MF, unsigned CC, + ArgListTy &Args) const override; + + MVT getScalarShiftAmountTy(const DataLayout &, EVT VT) const override { + return MVT::i8; } - /// Define some predicates that are used for node matching. - namespace X86 { - /// Returns true if Elt is a constant zero or floating point constant +0.0. - bool isZeroNode(SDValue Elt); - - /// Returns true of the given offset can be - /// fit into displacement field of the instruction. - bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, - bool hasSymbolicDisplacement); - - /// Determines whether the callee is required to pop its - /// own arguments. Callee pop is necessary to support tail calls. - bool isCalleePop(CallingConv::ID CallingConv, - bool is64Bit, bool IsVarArg, bool GuaranteeTCO); - - /// If Op is a constant whose elements are all the same constant or - /// undefined, return true and return the constant value in \p SplatVal. - /// If we have undef bits that don't cover an entire element, we treat these - /// as zero if AllowPartialUndefs is set, else we fail and return false. - bool isConstantSplat(SDValue Op, APInt &SplatVal, - bool AllowPartialUndefs = true); - - /// Check if Op is a load operation that could be folded into some other x86 - /// instruction as a memory operand. Example: vpaddd (%rdi), %xmm0, %xmm0. - bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, - bool AssumeSingleUse = false); - - /// Check if Op is a load operation that could be folded into a vector splat - /// instruction as a memory operand. Example: vbroadcastss 16(%rdi), %xmm2. - bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, - const X86Subtarget &Subtarget, - bool AssumeSingleUse = false); - - /// Check if Op is a value that could be used to fold a store into some - /// other x86 instruction as a memory operand. Ex: pextrb $0, %xmm0, (%rdi). - bool mayFoldIntoStore(SDValue Op); - - /// Check if Op is an operation that could be folded into a zero extend x86 - /// instruction. - bool mayFoldIntoZeroExtend(SDValue Op); - - /// True if the target supports the extended frame for async Swift - /// functions. - bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, - const MachineFunction &MF); - - /// Convert LLVM rounding mode to X86 rounding mode. 
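The X86::RoundingMode constants introduced above encode the rounding-control field in bits 11:10 using the same numeric values as glibc's FE_* macros. A minimal standalone sketch of extracting and naming that field, using local mirror constants only (illustrative, not part of the patch):

  #include <cstdio>

  // Same encodings as X86::RoundingMode in the header above.
  enum RoundingMode : unsigned {
    rmToNearest  = 0,        // FE_TONEAREST
    rmDownward   = 1 << 10,  // FE_DOWNWARD
    rmUpward     = 2 << 10,  // FE_UPWARD
    rmTowardZero = 3 << 10,  // FE_TOWARDZERO
    rmMask       = 3 << 10   // selects bits 11:10
  };

  // Isolate the rounding-control field from a raw control-word value.
  unsigned roundingField(unsigned ControlWord) { return ControlWord & rmMask; }

  int main() {
    unsigned CW = 0x0800;                     // example value: round upward
    std::printf("%#x\n", roundingField(CW));  // prints 0x800 (rmUpward)
    return 0;
  }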
- int getRoundingModeX86(unsigned RM); - - } // end namespace X86 - - //===--------------------------------------------------------------------===// - // X86 Implementation of the TargetLowering interface - class X86TargetLowering final : public TargetLowering { - public: - explicit X86TargetLowering(const X86TargetMachine &TM, - const X86Subtarget &STI); - - unsigned getJumpTableEncoding() const override; - bool useSoftFloat() const override; - - void markLibCallAttributes(MachineFunction *MF, unsigned CC, - ArgListTy &Args) const override; - - MVT getScalarShiftAmountTy(const DataLayout &, EVT VT) const override { - return MVT::i8; - } - - const MCExpr * - LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, - const MachineBasicBlock *MBB, unsigned uid, - MCContext &Ctx) const override; - - /// Returns relocation base for the given PIC jumptable. - SDValue getPICJumpTableRelocBase(SDValue Table, - SelectionDAG &DAG) const override; - const MCExpr * - getPICJumpTableRelocBaseExpr(const MachineFunction *MF, - unsigned JTI, MCContext &Ctx) const override; - - /// Return the desired alignment for ByVal aggregate - /// function arguments in the caller parameter area. For X86, aggregates - /// that contains are placed at 16-byte boundaries while the rest are at - /// 4-byte boundaries. - Align getByValTypeAlignment(Type *Ty, const DataLayout &DL) const override; - - EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, - const AttributeList &FuncAttributes) const override; - - /// Returns true if it's safe to use load / store of the - /// specified type to expand memcpy / memset inline. This is mostly true - /// for all types except for some special cases. For example, on X86 - /// targets without SSE2 f64 load / store are done with fldl / fstpl which - /// also does type conversion. Note the specified type doesn't have to be - /// legal as the hook is used before type legalization. - bool isSafeMemOpType(MVT VT) const override; - - bool isMemoryAccessFast(EVT VT, Align Alignment) const; - - /// Returns true if the target allows unaligned memory accesses of the - /// specified type. Returns whether it is "fast" in the last argument. - bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, Align Alignment, - MachineMemOperand::Flags Flags, - unsigned *Fast) const override; - - /// This function returns true if the memory access is aligned or if the - /// target allows this specific unaligned memory access. If the access is - /// allowed, the optional final parameter returns a relative speed of the - /// access (as defined by the target). - bool allowsMemoryAccess( - LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, - Align Alignment, - MachineMemOperand::Flags Flags = MachineMemOperand::MONone, - unsigned *Fast = nullptr) const override; - - bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, - const MachineMemOperand &MMO, - unsigned *Fast) const { - return allowsMemoryAccess(Context, DL, VT, MMO.getAddrSpace(), - MMO.getAlign(), MMO.getFlags(), Fast); - } - - /// Provide custom lowering hooks for some operations. - /// - SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; - - bool isSelectSupported(SelectSupportKind Kind) const override; - - /// Replace the results of node with an illegal result - /// type with new values built out of custom code. 
- /// - void ReplaceNodeResults(SDNode *N, SmallVectorImpl&Results, - SelectionDAG &DAG) const override; - - SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - - bool preferABDSToABSWithNSW(EVT VT) const override; - - bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, - EVT ExtVT) const override; - - bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, - EVT VT) const override; - - /// Return true if the target has native support for - /// the specified value type and it is 'desirable' to use the type for the - /// given node type. e.g. On x86 i16 is legal, but undesirable since i16 - /// instruction encodings are longer and some i16 instructions are slow. - bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override; - - /// Return true if the target has native support for the - /// specified value type and it is 'desirable' to use the type. e.g. On x86 - /// i16 is legal, but undesirable since i16 instruction encodings are longer - /// and some i16 instructions are slow. - bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override; - - /// Return prefered fold type, Abs if this is a vector, AddAnd if its an - /// integer, None otherwise. - TargetLowering::AndOrSETCCFoldKind - isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, - const SDNode *SETCC0, - const SDNode *SETCC1) const override; - - /// Return the newly negated expression if the cost is not expensive and - /// set the cost in \p Cost to indicate that if it is cheaper or neutral to - /// do the negation. - SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, - bool LegalOperations, bool ForCodeSize, - NegatibleCost &Cost, - unsigned Depth) const override; + const MCExpr *LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, + const MachineBasicBlock *MBB, + unsigned uid, + MCContext &Ctx) const override; + + /// Returns relocation base for the given PIC jumptable. + SDValue getPICJumpTableRelocBase(SDValue Table, + SelectionDAG &DAG) const override; + const MCExpr *getPICJumpTableRelocBaseExpr(const MachineFunction *MF, + unsigned JTI, + MCContext &Ctx) const override; + + /// Return the desired alignment for ByVal aggregate + /// function arguments in the caller parameter area. For X86, aggregates + /// that contains are placed at 16-byte boundaries while the rest are at + /// 4-byte boundaries. + Align getByValTypeAlignment(Type *Ty, const DataLayout &DL) const override; + + EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, + const AttributeList &FuncAttributes) const override; + + /// Returns true if it's safe to use load / store of the + /// specified type to expand memcpy / memset inline. This is mostly true + /// for all types except for some special cases. For example, on X86 + /// targets without SSE2 f64 load / store are done with fldl / fstpl which + /// also does type conversion. Note the specified type doesn't have to be + /// legal as the hook is used before type legalization. + bool isSafeMemOpType(MVT VT) const override; + + bool isMemoryAccessFast(EVT VT, Align Alignment) const; + + /// Returns true if the target allows unaligned memory accesses of the + /// specified type. Returns whether it is "fast" in the last argument. + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, Align Alignment, + MachineMemOperand::Flags Flags, + unsigned *Fast) const override; + + /// This function returns true if the memory access is aligned or if the + /// target allows this specific unaligned memory access. 
If the access is + /// allowed, the optional final parameter returns a relative speed of the + /// access (as defined by the target). + bool + allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, + unsigned AddrSpace, Align Alignment, + MachineMemOperand::Flags Flags = MachineMemOperand::MONone, + unsigned *Fast = nullptr) const override; + + bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, + const MachineMemOperand &MMO, unsigned *Fast) const { + return allowsMemoryAccess(Context, DL, VT, MMO.getAddrSpace(), + MMO.getAlign(), MMO.getFlags(), Fast); + } + + /// Provide custom lowering hooks for some operations. + /// + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + + bool isSelectSupported(SelectSupportKind Kind) const override; + + /// Replace the results of node with an illegal result + /// type with new values built out of custom code. + /// + void ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, + SelectionDAG &DAG) const override; + + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + + bool preferABDSToABSWithNSW(EVT VT) const override; + + bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override; + + bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, + EVT VT) const override; + + /// Return true if the target has native support for + /// the specified value type and it is 'desirable' to use the type for the + /// given node type. e.g. On x86 i16 is legal, but undesirable since i16 + /// instruction encodings are longer and some i16 instructions are slow. + bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override; + + /// Return true if the target has native support for the + /// specified value type and it is 'desirable' to use the type. e.g. On x86 + /// i16 is legal, but undesirable since i16 instruction encodings are longer + /// and some i16 instructions are slow. + bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override; + + /// Return prefered fold type, Abs if this is a vector, AddAnd if its an + /// integer, None otherwise. + TargetLowering::AndOrSETCCFoldKind + isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, + const SDNode *SETCC0, + const SDNode *SETCC1) const override; + + /// Return the newly negated expression if the cost is not expensive and + /// set the cost in \p Cost to indicate that if it is cheaper or neutral to + /// do the negation. + SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, + bool LegalOperations, bool ForCodeSize, + NegatibleCost &Cost, + unsigned Depth) const override; + + MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *MBB) const override; + + /// This method returns the name of a target specific DAG node. + const char *getTargetNodeName(unsigned Opcode) const override; + + /// Do not merge vector stores after legalization because that may conflict + /// with x86-specific store splitting optimizations. 
+ bool mergeStoresAfterLegalization(EVT MemVT) const override { + return !MemVT.isVector(); + } + + bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, + const MachineFunction &MF) const override; + + bool isCheapToSpeculateCttz(Type *Ty) const override; + + bool isCheapToSpeculateCtlz(Type *Ty) const override; + + bool isCtlzFast() const override; + + bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override { + // If the pair to store is a mixture of float and int values, we will + // save two bitwise instructions and one float-to-int instruction and + // increase one store instruction. There is potentially a more + // significant benefit because it avoids the float->int domain switch + // for input value. So It is more likely a win. + if ((LTy.isFloatingPoint() && HTy.isInteger()) || + (LTy.isInteger() && HTy.isFloatingPoint())) + return true; + // If the pair only contains int values, we will save two bitwise + // instructions and increase one store instruction (costing one more + // store buffer). Since the benefit is more blurred so we leave + // such pair out until we get testcase to prove it is a win. + return false; + } + + bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override; + + bool hasAndNotCompare(SDValue Y) const override; + + bool hasAndNot(SDValue Y) const override; + + bool hasBitTest(SDValue X, SDValue Y) const override; - MachineBasicBlock * - EmitInstrWithCustomInserter(MachineInstr &MI, - MachineBasicBlock *MBB) const override; - - /// This method returns the name of a target specific DAG node. - const char *getTargetNodeName(unsigned Opcode) const override; - - /// Do not merge vector stores after legalization because that may conflict - /// with x86-specific store splitting optimizations. - bool mergeStoresAfterLegalization(EVT MemVT) const override { - return !MemVT.isVector(); - } - - bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, - const MachineFunction &MF) const override; - - bool isCheapToSpeculateCttz(Type *Ty) const override; - - bool isCheapToSpeculateCtlz(Type *Ty) const override; - - bool isCtlzFast() const override; - - bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override { - // If the pair to store is a mixture of float and int values, we will - // save two bitwise instructions and one float-to-int instruction and - // increase one store instruction. There is potentially a more - // significant benefit because it avoids the float->int domain switch - // for input value. So It is more likely a win. - if ((LTy.isFloatingPoint() && HTy.isInteger()) || - (LTy.isInteger() && HTy.isFloatingPoint())) - return true; - // If the pair only contains int values, we will save two bitwise - // instructions and increase one store instruction (costing one more - // store buffer). Since the benefit is more blurred so we leave - // such pair out until we get testcase to prove it is a win. 
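isMultiStoresCheaperThanBitsMerge above only prefers two separate stores for a mixed float/integer pair; a pair of the same kind keeps the merged store. A tiny standalone restatement of that decision, treating each half as simply FP or integer (hypothetical helper name, not part of the patch):

  #include <cassert>

  // Mirrors the predicate above: split the store only for a mixed FP/int pair,
  // which avoids bitwise merging and an FP-to-int domain switch.
  bool multiStoresCheaper(bool LoIsFP, bool HiIsFP) {
    return LoIsFP != HiIsFP;  // (fp,int) or (int,fp) -> true; same kind -> false
  }

  int main() {
    assert(multiStoresCheaper(true, false));    // f32 + i32 pair: split the store
    assert(multiStoresCheaper(false, true));
    assert(!multiStoresCheaper(false, false));  // i32 + i32 pair: keep the merge
    assert(!multiStoresCheaper(true, true));
    return 0;
  }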
+ bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( + SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, + unsigned OldShiftOpcode, unsigned NewShiftOpcode, + SelectionDAG &DAG) const override; + + unsigned preferedOpcodeForCmpEqPiecesOfOperand( + EVT VT, unsigned ShiftOpc, bool MayTransformRotate, + const APInt &ShiftOrRotateAmt, + const std::optional &AndMask) const override; + + bool preferScalarizeSplat(SDNode *N) const override; + + CondMergingParams + getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, + const Value *Rhs) const override; + + bool shouldFoldConstantShiftPairToMask(const SDNode *N) const override; + + bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override; + + bool shouldTransformSignedTruncationCheck(EVT XVT, + unsigned KeptBits) const override { + // For vectors, we don't have a preference.. + if (XVT.isVector()) return false; - } - bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override; - - bool hasAndNotCompare(SDValue Y) const override; - - bool hasAndNot(SDValue Y) const override; - - bool hasBitTest(SDValue X, SDValue Y) const override; - - bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( - SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, - unsigned OldShiftOpcode, unsigned NewShiftOpcode, - SelectionDAG &DAG) const override; - - unsigned preferedOpcodeForCmpEqPiecesOfOperand( - EVT VT, unsigned ShiftOpc, bool MayTransformRotate, - const APInt &ShiftOrRotateAmt, - const std::optional &AndMask) const override; - - bool preferScalarizeSplat(SDNode *N) const override; - - CondMergingParams - getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, - const Value *Rhs) const override; - - bool shouldFoldConstantShiftPairToMask(const SDNode *N) const override; - - bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override; - - bool - shouldTransformSignedTruncationCheck(EVT XVT, - unsigned KeptBits) const override { - // For vectors, we don't have a preference.. - if (XVT.isVector()) - return false; - - auto VTIsOk = [](EVT VT) -> bool { - return VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || - VT == MVT::i64; - }; - - // We are ok with KeptBitsVT being byte/word/dword, what MOVS supports. - // XVT will be larger than KeptBitsVT. - MVT KeptBitsVT = MVT::getIntegerVT(KeptBits); - return VTIsOk(XVT) && VTIsOk(KeptBitsVT); - } - - ShiftLegalizationStrategy - preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, - unsigned ExpansionFactor) const override; - - bool shouldSplatInsEltVarIndex(EVT VT) const override; - - bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override { - // Converting to sat variants holds little benefit on X86 as we will just - // need to saturate the value back using fp arithmatic. - return Op != ISD::FP_TO_UINT_SAT && isOperationLegalOrCustom(Op, VT); - } - - bool convertSetCCLogicToBitwiseLogic(EVT VT) const override { - return VT.isScalarInteger(); - } - - /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST. - MVT hasFastEqualityCompare(unsigned NumBits) const override; - - /// Return the value type to use for ISD::SETCC. 
- EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, - EVT VT) const override; - - bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, - const APInt &DemandedElts, - TargetLoweringOpt &TLO) const override; - - /// Determine which of the bits specified in Mask are known to be either - /// zero or one and return them in the KnownZero/KnownOne bitsets. - void computeKnownBitsForTargetNode(const SDValue Op, - KnownBits &Known, - const APInt &DemandedElts, - const SelectionDAG &DAG, - unsigned Depth = 0) const override; - - /// Determine the number of bits in the operation that are sign bits. - unsigned ComputeNumSignBitsForTargetNode(SDValue Op, - const APInt &DemandedElts, - const SelectionDAG &DAG, - unsigned Depth) const override; - - bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, - const APInt &DemandedElts, - APInt &KnownUndef, - APInt &KnownZero, - TargetLoweringOpt &TLO, - unsigned Depth) const override; - - bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, - const APInt &DemandedElts, - unsigned MaskIndex, - TargetLoweringOpt &TLO, - unsigned Depth) const; + auto VTIsOk = [](EVT VT) -> bool { + return VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || + VT == MVT::i64; + }; + + // We are ok with KeptBitsVT being byte/word/dword, what MOVS supports. + // XVT will be larger than KeptBitsVT. + MVT KeptBitsVT = MVT::getIntegerVT(KeptBits); + return VTIsOk(XVT) && VTIsOk(KeptBitsVT); + } + + ShiftLegalizationStrategy + preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, + unsigned ExpansionFactor) const override; - bool SimplifyDemandedBitsForTargetNode(SDValue Op, - const APInt &DemandedBits, + bool shouldSplatInsEltVarIndex(EVT VT) const override; + + bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override { + // Converting to sat variants holds little benefit on X86 as we will just + // need to saturate the value back using fp arithmatic. + return Op != ISD::FP_TO_UINT_SAT && isOperationLegalOrCustom(Op, VT); + } + + bool convertSetCCLogicToBitwiseLogic(EVT VT) const override { + return VT.isScalarInteger(); + } + + /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST. + MVT hasFastEqualityCompare(unsigned NumBits) const override; + + /// Return the value type to use for ISD::SETCC. + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, + EVT VT) const override; + + bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, + const APInt &DemandedElts, + TargetLoweringOpt &TLO) const override; + + /// Determine which of the bits specified in Mask are known to be either + /// zero or one and return them in the KnownZero/KnownOne bitsets. + void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, + const APInt &DemandedElts, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; + + /// Determine the number of bits in the operation that are sign bits. 
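shouldTransformSignedTruncationCheck above reduces to a width check: the operation must be scalar, and both the original type and the kept-bits type must be one of i8/i16/i32/i64. A standalone sketch of the same rule over plain bit widths (hypothetical helper names, not part of the patch):

  #include <cassert>

  // True for the scalar integer widths the hook accepts (i8/i16/i32/i64).
  bool vtIsOk(unsigned Bits) {
    return Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64;
  }

  // Mirrors the hook: no preference for vectors, and both widths must be "ok".
  bool signedTruncCheckOk(bool IsVector, unsigned XBits, unsigned KeptBits) {
    if (IsVector)
      return false;
    return vtIsOk(XBits) && vtIsOk(KeptBits);
  }

  int main() {
    assert(signedTruncCheckOk(false, 32, 8));    // i32 value, keep the low 8 bits
    assert(!signedTruncCheckOk(false, 32, 12));  // i12 is not a supported width
    assert(!signedTruncCheckOk(true, 32, 8));    // vectors: no preference
    return 0;
  }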
+ unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, - KnownBits &Known, - TargetLoweringOpt &TLO, + const SelectionDAG &DAG, unsigned Depth) const override; - SDValue SimplifyMultipleUseDemandedBitsForTargetNode( - SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, - SelectionDAG &DAG, unsigned Depth) const override; + bool SimplifyDemandedVectorEltsForTargetNode( + SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, + APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override; - bool isGuaranteedNotToBeUndefOrPoisonForTargetNode( - SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, - bool PoisonOnly, unsigned Depth) const override; + bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, + const APInt &DemandedElts, + unsigned MaskIndex, + TargetLoweringOpt &TLO, + unsigned Depth) const; - bool canCreateUndefOrPoisonForTargetNode( - SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, - bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override; + bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, + const APInt &DemandedElts, + KnownBits &Known, + TargetLoweringOpt &TLO, + unsigned Depth) const override; - bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, - APInt &UndefElts, const SelectionDAG &DAG, - unsigned Depth) const override; + SDValue SimplifyMultipleUseDemandedBitsForTargetNode( + SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, + SelectionDAG &DAG, unsigned Depth) const override; - bool isTargetCanonicalConstantNode(SDValue Op) const override { - // Peek through bitcasts/extracts/inserts to see if we have a vector - // load/broadcast from memory. - while (Op.getOpcode() == ISD::BITCAST || - Op.getOpcode() == ISD::EXTRACT_SUBVECTOR || - (Op.getOpcode() == ISD::INSERT_SUBVECTOR && - Op.getOperand(0).isUndef())) - Op = Op.getOperand(Op.getOpcode() == ISD::INSERT_SUBVECTOR ? 1 : 0); + bool isGuaranteedNotToBeUndefOrPoisonForTargetNode( + SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, + bool PoisonOnly, unsigned Depth) const override; - return Op.getOpcode() == X86ISD::VBROADCAST_LOAD || - Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD || - (Op.getOpcode() == ISD::LOAD && - getTargetConstantFromLoad(cast(Op))) || - TargetLowering::isTargetCanonicalConstantNode(Op); - } + bool canCreateUndefOrPoisonForTargetNode(SDValue Op, + const APInt &DemandedElts, + const SelectionDAG &DAG, + bool PoisonOnly, bool ConsiderFlags, + unsigned Depth) const override; + + bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, + APInt &UndefElts, const SelectionDAG &DAG, + unsigned Depth) const override; + + bool isTargetCanonicalConstantNode(SDValue Op) const override { + // Peek through bitcasts/extracts/inserts to see if we have a vector + // load/broadcast from memory. + while ( + Op.getOpcode() == ISD::BITCAST || + Op.getOpcode() == ISD::EXTRACT_SUBVECTOR || + (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef())) + Op = Op.getOperand(Op.getOpcode() == ISD::INSERT_SUBVECTOR ? 
1 : 0); + + return Op.getOpcode() == X86ISD::VBROADCAST_LOAD || + Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD || + (Op.getOpcode() == ISD::LOAD && + getTargetConstantFromLoad(cast(Op))) || + TargetLowering::isTargetCanonicalConstantNode(Op); + } - bool isTargetCanonicalSelect(SDNode *N) const override; + bool isTargetCanonicalSelect(SDNode *N) const override; - const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override; + const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override; - SDValue unwrapAddress(SDValue N) const override; + SDValue unwrapAddress(SDValue N) const override; - SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const; + SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const; - ConstraintType getConstraintType(StringRef Constraint) const override; + ConstraintType getConstraintType(StringRef Constraint) const override; - /// Examine constraint string and operand type and determine a weight value. - /// The operand object must already have been set up with the operand type. - ConstraintWeight - getSingleConstraintMatchWeight(AsmOperandInfo &Info, - const char *Constraint) const override; + /// Examine constraint string and operand type and determine a weight value. + /// The operand object must already have been set up with the operand type. + ConstraintWeight + getSingleConstraintMatchWeight(AsmOperandInfo &Info, + const char *Constraint) const override; - const char *LowerXConstraint(EVT ConstraintVT) const override; + const char *LowerXConstraint(EVT ConstraintVT) const override; - /// Lower the specified operand into the Ops vector. If it is invalid, don't - /// add anything to Ops. If hasMemory is true it means one of the asm - /// constraint of the inline asm instruction being processed is 'm'. - void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, - std::vector &Ops, + /// Lower the specified operand into the Ops vector. If it is invalid, don't + /// add anything to Ops. If hasMemory is true it means one of the asm + /// constraint of the inline asm instruction being processed is 'm'. + void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, + std::vector &Ops, + SelectionDAG &DAG) const override; + + InlineAsm::ConstraintCode + getInlineAsmMemConstraint(StringRef ConstraintCode) const override { + if (ConstraintCode == "v") + return InlineAsm::ConstraintCode::v; + return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); + } + + /// Handle Lowering flag assembly outputs. + SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, + const SDLoc &DL, + const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override; - InlineAsm::ConstraintCode - getInlineAsmMemConstraint(StringRef ConstraintCode) const override { - if (ConstraintCode == "v") - return InlineAsm::ConstraintCode::v; - return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); - } - - /// Handle Lowering flag assembly outputs. - SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, - const SDLoc &DL, - const AsmOperandInfo &Constraint, - SelectionDAG &DAG) const override; - - /// Given a physical register constraint - /// (e.g. {edx}), return the register number and the register class for the - /// register. This should only be used for C_Register constraints. On - /// error, this returns a register number of 0. 
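isTargetCanonicalConstantNode above first peels value-preserving wrappers (bitcasts, subvector extracts, inserts into an undef base) and only then asks whether the underlying node is a broadcast or constant-pool load. A minimal standalone model of that peel-then-classify shape, with made-up node kinds rather than LLVM's SDNode machinery (illustrative only):

  #include <cassert>

  enum Kind { Bitcast, ExtractSubvector, BroadcastLoad, OtherNode };

  struct Node {
    Kind K;
    const Node *Operand;  // a single operand is enough for the sketch
  };

  // Walk through wrapper nodes, then classify the node we land on.
  bool isCanonicalConstant(const Node *N) {
    while (N->K == Bitcast || N->K == ExtractSubvector)
      N = N->Operand;
    return N->K == BroadcastLoad;
  }

  int main() {
    Node Load{BroadcastLoad, nullptr};
    Node Cast{Bitcast, &Load};
    Node Extract{ExtractSubvector, &Cast};
    assert(isCanonicalConstant(&Extract));  // peels two wrappers, finds the load
    Node Other{OtherNode, nullptr};
    assert(!isCanonicalConstant(&Other));
    return 0;
  }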
- std::pair - getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - StringRef Constraint, MVT VT) const override; - - /// Return true if the addressing mode represented - /// by AM is legal for this target, for a load/store of the specified type. - bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, - Type *Ty, unsigned AS, - Instruction *I = nullptr) const override; - - bool addressingModeSupportsTLS(const GlobalValue &GV) const override; - - /// Return true if the specified immediate is legal - /// icmp immediate, that is the target has icmp instructions which can - /// compare a register against the immediate without having to materialize - /// the immediate into a register. - bool isLegalICmpImmediate(int64_t Imm) const override; - - /// Return true if the specified immediate is legal - /// add immediate, that is the target has add instructions which can - /// add a register and the immediate without having to materialize - /// the immediate into a register. - bool isLegalAddImmediate(int64_t Imm) const override; - - bool isLegalStoreImmediate(int64_t Imm) const override; - - /// Add x86-specific opcodes to the default list. - bool isBinOp(unsigned Opcode) const override; - - /// Returns true if the opcode is a commutative binary operation. - bool isCommutativeBinOp(unsigned Opcode) const override; - - /// Return true if it's free to truncate a value of - /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in - /// register EAX to i16 by referencing its sub-register AX. - bool isTruncateFree(Type *Ty1, Type *Ty2) const override; - bool isTruncateFree(EVT VT1, EVT VT2) const override; - - bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override; - - /// Return true if any actual instruction that defines a - /// value of type Ty1 implicit zero-extends the value to Ty2 in the result - /// register. This does not necessarily include registers defined in - /// unknown ways, such as incoming arguments, or copies from unknown - /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this - /// does not necessarily apply to truncate instructions. e.g. on x86-64, - /// all instructions that define 32-bit values implicit zero-extend the - /// result out to 64 bits. - bool isZExtFree(Type *Ty1, Type *Ty2) const override; - bool isZExtFree(EVT VT1, EVT VT2) const override; - bool isZExtFree(SDValue Val, EVT VT2) const override; - - bool shouldConvertPhiType(Type *From, Type *To) const override; - - /// Return true if folding a vector load into ExtVal (a sign, zero, or any - /// extend node) is profitable. - bool isVectorLoadExtDesirable(SDValue) const override; - - /// Return true if an FMA operation is faster than a pair of fmul and fadd - /// instructions. fmuladd intrinsics will be expanded to FMAs when this - /// method returns true, otherwise fmuladd is expanded to fmul + fadd. - bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + /// Given a physical register constraint + /// (e.g. {edx}), return the register number and the register class for the + /// register. This should only be used for C_Register constraints. On + /// error, this returns a register number of 0. + std::pair + getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + StringRef Constraint, MVT VT) const override; + + /// Return true if the addressing mode represented + /// by AM is legal for this target, for a load/store of the specified type. 
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, + unsigned AS, + Instruction *I = nullptr) const override; + + bool addressingModeSupportsTLS(const GlobalValue &GV) const override; + + /// Return true if the specified immediate is legal + /// icmp immediate, that is the target has icmp instructions which can + /// compare a register against the immediate without having to materialize + /// the immediate into a register. + bool isLegalICmpImmediate(int64_t Imm) const override; + + /// Return true if the specified immediate is legal + /// add immediate, that is the target has add instructions which can + /// add a register and the immediate without having to materialize + /// the immediate into a register. + bool isLegalAddImmediate(int64_t Imm) const override; + + bool isLegalStoreImmediate(int64_t Imm) const override; + + /// Add x86-specific opcodes to the default list. + bool isBinOp(unsigned Opcode) const override; + + /// Returns true if the opcode is a commutative binary operation. + bool isCommutativeBinOp(unsigned Opcode) const override; + + /// Return true if it's free to truncate a value of + /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in + /// register EAX to i16 by referencing its sub-register AX. + bool isTruncateFree(Type *Ty1, Type *Ty2) const override; + bool isTruncateFree(EVT VT1, EVT VT2) const override; + + bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override; + + /// Return true if any actual instruction that defines a + /// value of type Ty1 implicit zero-extends the value to Ty2 in the result + /// register. This does not necessarily include registers defined in + /// unknown ways, such as incoming arguments, or copies from unknown + /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this + /// does not necessarily apply to truncate instructions. e.g. on x86-64, + /// all instructions that define 32-bit values implicit zero-extend the + /// result out to 64 bits. + bool isZExtFree(Type *Ty1, Type *Ty2) const override; + bool isZExtFree(EVT VT1, EVT VT2) const override; + bool isZExtFree(SDValue Val, EVT VT2) const override; + + bool shouldConvertPhiType(Type *From, Type *To) const override; + + /// Return true if folding a vector load into ExtVal (a sign, zero, or any + /// extend node) is profitable. + bool isVectorLoadExtDesirable(SDValue) const override; + + /// Return true if an FMA operation is faster than a pair of fmul and fadd + /// instructions. fmuladd intrinsics will be expanded to FMAs when this + /// method returns true, otherwise fmuladd is expanded to fmul + fadd. + bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + EVT VT) const override; + + /// Return true if it's profitable to narrow operations of type SrcVT to + /// DestVT. e.g. on x86, it's profitable to narrow from i32 to i8 but not + /// from i32 to i16. + bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override; + + bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, + unsigned SelectOpcode, SDValue X, + SDValue Y) const override; + + /// Given an intrinsic, checks if on the target the intrinsic will need to map + /// to a MemIntrinsicNode (touches memory). If this is the case, it returns + /// true and stores the intrinsic information into the IntrinsicInfo that was + /// passed to the function. 
+ bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + MachineFunction &MF, + unsigned Intrinsic) const override; + + /// Returns true if the target can instruction select the + /// specified FP immediate natively. If false, the legalizer will + /// materialize the FP immediate as a load from a constant pool. + bool isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const override; + + /// Targets can use this to indicate that they only support *some* + /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a + /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to + /// be legal. + bool isShuffleMaskLegal(ArrayRef Mask, EVT VT) const override; + + /// Similar to isShuffleMaskLegal. Targets can use this to indicate if there + /// is a suitable VECTOR_SHUFFLE that can be used to replace a VAND with a + /// constant pool entry. + bool isVectorClearMaskLegal(ArrayRef Mask, EVT VT) const override; + + /// Returns true if lowering to a jump table is allowed. + bool areJTsAllowed(const Function *Fn) const override; + + MVT getPreferredSwitchConditionType(LLVMContext &Context, + EVT ConditionVT) const override; + + /// If true, then instruction selection should + /// seek to shrink the FP constant of the specified type to a smaller type + /// in order to save space and / or reduce runtime. + bool ShouldShrinkFPConstant(EVT VT) const override; + + /// Return true if we believe it is correct and profitable to reduce the + /// load node to a smaller type. + bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, + std::optional ByteOffset) const override; + + /// Return true if the specified scalar FP type is computed in an SSE + /// register, not on the X87 floating point stack. + bool isScalarFPTypeInSSEReg(EVT VT) const; + + /// Returns true if it is beneficial to convert a load of a constant + /// to just the constant itself. + bool shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const override; + + bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override; + + bool convertSelectOfConstantsToMath(EVT VT) const override; + + bool decomposeMulByConstant(LLVMContext &Context, EVT VT, + SDValue C) const override; + + /// Return true if EXTRACT_SUBVECTOR is cheap for this result type + /// with this index. + bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, + unsigned Index) const override; + + /// Scalar ops always have equal or better analysis/performance/power than + /// the vector equivalent, so this always makes sense if the scalar op is + /// supported. + bool shouldScalarizeBinop(SDValue) const override; + + /// Extract of a scalar FP value from index 0 of a vector is free. + bool isExtractVecEltCheap(EVT VT, unsigned Index) const override { + EVT EltVT = VT.getScalarType(); + return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0; + } + + /// Overflow nodes should get combined/lowered to optimal instructions + /// (they should allow eliminating explicit compares by getting flags from + /// math ops). + bool shouldFormOverflowOp(unsigned Opcode, EVT VT, + bool MathUsed) const override; + + bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, + unsigned AddrSpace) const override { + // If we can replace more than 2 scalar stores, there will be a reduction + // in instructions even after we add a vector constant load. 
+ return IsZero || NumElem > 2; + } + + bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, + const SelectionDAG &DAG, + const MachineMemOperand &MMO) const override; + + Register getRegisterByName(const char *RegName, LLT VT, + const MachineFunction &MF) const override; + + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + Register + getExceptionPointerRegister(const Constant *PersonalityFn) const override; + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + Register + getExceptionSelectorRegister(const Constant *PersonalityFn) const override; + + bool needsFixedCatchObjects() const override; + + /// This method returns a target specific FastISel object, + /// or null if the target does not support "fast" ISel. + FastISel *createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) const override; + + /// If the target has a standard location for the stack protector cookie, + /// returns the address of that location. Otherwise, returns nullptr. + Value *getIRStackGuard(IRBuilderBase &IRB) const override; + + bool useLoadStackGuardNode(const Module &M) const override; + bool useStackGuardXorFP() const override; + void insertSSPDeclarations(Module &M) const override; + SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, + const SDLoc &DL) const override; + + /// Return true if the target stores SafeStack pointer at a fixed offset in + /// some non-standard address space, and populates the address space and + /// offset as appropriate. + Value *getSafeStackPointerLocation(IRBuilderBase &IRB) const override; + + std::pair BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, + SDValue Chain, SDValue Pointer, + MachinePointerInfo PtrInfo, + Align Alignment, + SelectionDAG &DAG) const; + + /// Customize the preferred legalization strategy for certain types. + LegalizeTypeAction getPreferredVectorAction(MVT VT) const override; + + bool softPromoteHalfType() const override { return true; } + + MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override; - /// Return true if it's profitable to narrow operations of type SrcVT to - /// DestVT. e.g. on x86, it's profitable to narrow from i32 to i8 but not - /// from i32 to i16. - bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override; - - bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, - unsigned SelectOpcode, SDValue X, - SDValue Y) const override; - - /// Given an intrinsic, checks if on the target the intrinsic will need to map - /// to a MemIntrinsicNode (touches memory). If this is the case, it returns - /// true and stores the intrinsic information into the IntrinsicInfo that was - /// passed to the function. - bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, - MachineFunction &MF, - unsigned Intrinsic) const override; - - /// Returns true if the target can instruction select the - /// specified FP immediate natively. If false, the legalizer will - /// materialize the FP immediate as a load from a constant pool. - bool isFPImmLegal(const APFloat &Imm, EVT VT, - bool ForCodeSize) const override; - - /// Targets can use this to indicate that they only support *some* - /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a - /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to - /// be legal. 
- bool isShuffleMaskLegal(ArrayRef Mask, EVT VT) const override; - - /// Similar to isShuffleMaskLegal. Targets can use this to indicate if there - /// is a suitable VECTOR_SHUFFLE that can be used to replace a VAND with a - /// constant pool entry. - bool isVectorClearMaskLegal(ArrayRef Mask, EVT VT) const override; - - /// Returns true if lowering to a jump table is allowed. - bool areJTsAllowed(const Function *Fn) const override; - - MVT getPreferredSwitchConditionType(LLVMContext &Context, - EVT ConditionVT) const override; - - /// If true, then instruction selection should - /// seek to shrink the FP constant of the specified type to a smaller type - /// in order to save space and / or reduce runtime. - bool ShouldShrinkFPConstant(EVT VT) const override; - - /// Return true if we believe it is correct and profitable to reduce the - /// load node to a smaller type. - bool - shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, - std::optional ByteOffset) const override; - - /// Return true if the specified scalar FP type is computed in an SSE - /// register, not on the X87 floating point stack. - bool isScalarFPTypeInSSEReg(EVT VT) const; - - /// Returns true if it is beneficial to convert a load of a constant - /// to just the constant itself. - bool shouldConvertConstantLoadToIntImm(const APInt &Imm, - Type *Ty) const override; - - bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override; - - bool convertSelectOfConstantsToMath(EVT VT) const override; - - bool decomposeMulByConstant(LLVMContext &Context, EVT VT, - SDValue C) const override; - - /// Return true if EXTRACT_SUBVECTOR is cheap for this result type - /// with this index. - bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, - unsigned Index) const override; - - /// Scalar ops always have equal or better analysis/performance/power than - /// the vector equivalent, so this always makes sense if the scalar op is - /// supported. - bool shouldScalarizeBinop(SDValue) const override; - - /// Extract of a scalar FP value from index 0 of a vector is free. - bool isExtractVecEltCheap(EVT VT, unsigned Index) const override { - EVT EltVT = VT.getScalarType(); - return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0; - } + unsigned getNumRegistersForCallingConv(LLVMContext &Context, + CallingConv::ID CC, + EVT VT) const override; - /// Overflow nodes should get combined/lowered to optimal instructions - /// (they should allow eliminating explicit compares by getting flags from - /// math ops). - bool shouldFormOverflowOp(unsigned Opcode, EVT VT, - bool MathUsed) const override; + unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, + CallingConv::ID CC, EVT VT, + EVT &IntermediateVT, + unsigned &NumIntermediates, + MVT &RegisterVT) const override; - bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, - unsigned AddrSpace) const override { - // If we can replace more than 2 scalar stores, there will be a reduction - // in instructions even after we add a vector constant load. - return IsZero || NumElem > 2; - } - - bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, - const SelectionDAG &DAG, - const MachineMemOperand &MMO) const override; - - Register getRegisterByName(const char* RegName, LLT VT, - const MachineFunction &MF) const override; - - /// If a physical register, this returns the register that receives the - /// exception address on entry to an EH pad. 
- Register - getExceptionPointerRegister(const Constant *PersonalityFn) const override; - - /// If a physical register, this returns the register that receives the - /// exception typeid on entry to a landing pad. - Register - getExceptionSelectorRegister(const Constant *PersonalityFn) const override; - - bool needsFixedCatchObjects() const override; - - /// This method returns a target specific FastISel object, - /// or null if the target does not support "fast" ISel. - FastISel *createFastISel(FunctionLoweringInfo &funcInfo, - const TargetLibraryInfo *libInfo) const override; - - /// If the target has a standard location for the stack protector cookie, - /// returns the address of that location. Otherwise, returns nullptr. - Value *getIRStackGuard(IRBuilderBase &IRB) const override; - - bool useLoadStackGuardNode(const Module &M) const override; - bool useStackGuardXorFP() const override; - void insertSSPDeclarations(Module &M) const override; - SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, - const SDLoc &DL) const override; + bool functionArgumentNeedsConsecutiveRegisters( + Type *Ty, CallingConv::ID CallConv, bool isVarArg, + const DataLayout &DL) const override; + bool isIntDivCheap(EVT VT, AttributeList Attr) const override; - /// Return true if the target stores SafeStack pointer at a fixed offset in - /// some non-standard address space, and populates the address space and - /// offset as appropriate. - Value *getSafeStackPointerLocation(IRBuilderBase &IRB) const override; + bool supportSwiftError() const override; - std::pair BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, - SDValue Chain, SDValue Pointer, - MachinePointerInfo PtrInfo, - Align Alignment, - SelectionDAG &DAG) const; - - /// Customize the preferred legalization strategy for certain types. 
- LegalizeTypeAction getPreferredVectorAction(MVT VT) const override; + bool supportKCFIBundles() const override { return true; } - bool softPromoteHalfType() const override { return true; } - - MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, - EVT VT) const override; + MachineInstr *EmitKCFICheck(MachineBasicBlock &MBB, + MachineBasicBlock::instr_iterator &MBBI, + const TargetInstrInfo *TII) const override; - unsigned getNumRegistersForCallingConv(LLVMContext &Context, - CallingConv::ID CC, - EVT VT) const override; + bool hasStackProbeSymbol(const MachineFunction &MF) const override; + bool hasInlineStackProbe(const MachineFunction &MF) const override; + StringRef getStackProbeSymbolName(const MachineFunction &MF) const override; - unsigned getVectorTypeBreakdownForCallingConv( - LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, - unsigned &NumIntermediates, MVT &RegisterVT) const override; + unsigned getStackProbeSize(const MachineFunction &MF) const; - bool functionArgumentNeedsConsecutiveRegisters( - Type *Ty, CallingConv::ID CallConv, bool isVarArg, - const DataLayout &DL) const override; + bool hasVectorBlend() const override { return true; } - bool isIntDivCheap(EVT VT, AttributeList Attr) const override; - - bool supportSwiftError() const override; - - bool supportKCFIBundles() const override { return true; } - - MachineInstr *EmitKCFICheck(MachineBasicBlock &MBB, - MachineBasicBlock::instr_iterator &MBBI, - const TargetInstrInfo *TII) const override; - - bool hasStackProbeSymbol(const MachineFunction &MF) const override; - bool hasInlineStackProbe(const MachineFunction &MF) const override; - StringRef getStackProbeSymbolName(const MachineFunction &MF) const override; - - unsigned getStackProbeSize(const MachineFunction &MF) const; - - bool hasVectorBlend() const override { return true; } - - unsigned getMaxSupportedInterleaveFactor() const override { return 4; } + unsigned getMaxSupportedInterleaveFactor() const override { return 4; } - bool isInlineAsmTargetBranch(const SmallVectorImpl &AsmStrs, - unsigned OpNo) const override; + bool isInlineAsmTargetBranch(const SmallVectorImpl &AsmStrs, + unsigned OpNo) const override; - SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, - MachineMemOperand *MMO, SDValue &NewLoad, - SDValue Ptr, SDValue PassThru, - SDValue Mask) const override; - SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, - MachineMemOperand *MMO, SDValue Ptr, SDValue Val, - SDValue Mask) const override; + SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, + MachineMemOperand *MMO, SDValue &NewLoad, SDValue Ptr, + SDValue PassThru, SDValue Mask) const override; + SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, + MachineMemOperand *MMO, SDValue Ptr, SDValue Val, + SDValue Mask) const override; - /// Lower interleaved load(s) into target specific - /// instructions/intrinsics. - bool lowerInterleavedLoad(Instruction *Load, Value *Mask, - ArrayRef Shuffles, - ArrayRef Indices, unsigned Factor, - const APInt &GapMask) const override; + /// Lower interleaved load(s) into target specific + /// instructions/intrinsics. + bool lowerInterleavedLoad(Instruction *Load, Value *Mask, + ArrayRef Shuffles, + ArrayRef Indices, unsigned Factor, + const APInt &GapMask) const override; - /// Lower interleaved store(s) into target specific - /// instructions/intrinsics. 
- bool lowerInterleavedStore(Instruction *Store, Value *Mask, - ShuffleVectorInst *SVI, unsigned Factor, - const APInt &GapMask) const override; + /// Lower interleaved store(s) into target specific + /// instructions/intrinsics. + bool lowerInterleavedStore(Instruction *Store, Value *Mask, + ShuffleVectorInst *SVI, unsigned Factor, + const APInt &GapMask) const override; - SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, - int JTI, SelectionDAG &DAG) const override; + SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, + int JTI, SelectionDAG &DAG) const override; - Align getPrefLoopAlignment(MachineLoop *ML) const override; + Align getPrefLoopAlignment(MachineLoop *ML) const override; - EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override { - if (VT == MVT::f80) - return EVT::getIntegerVT(Context, 96); - return TargetLoweringBase::getTypeToTransformTo(Context, VT); - } + EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override { + if (VT == MVT::f80) + return EVT::getIntegerVT(Context, 96); + return TargetLoweringBase::getTypeToTransformTo(Context, VT); + } - protected: - std::pair - findRepresentativeClass(const TargetRegisterInfo *TRI, - MVT VT) const override; +protected: + std::pair + findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override; - private: - /// Keep a reference to the X86Subtarget around so that we can - /// make the right decision when generating code for different targets. - const X86Subtarget &Subtarget; +private: + /// Keep a reference to the X86Subtarget around so that we can + /// make the right decision when generating code for different targets. + const X86Subtarget &Subtarget; - /// A list of legal FP immediates. - std::vector LegalFPImmediates; + /// A list of legal FP immediates. + std::vector LegalFPImmediates; - /// Indicate that this x86 target can instruction - /// select the specified FP immediate natively. - void addLegalFPImmediate(const APFloat& Imm) { - LegalFPImmediates.push_back(Imm); - } + /// Indicate that this x86 target can instruction + /// select the specified FP immediate natively. + void addLegalFPImmediate(const APFloat &Imm) { + LegalFPImmediates.push_back(Imm); + } - SDValue LowerCallResult(SDValue Chain, SDValue InGlue, - CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, - const SDLoc &dl, SelectionDAG &DAG, - SmallVectorImpl &InVals, - uint32_t *RegMask) const; - SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, - const SmallVectorImpl &ArgInfo, - const SDLoc &dl, SelectionDAG &DAG, - const CCValAssign &VA, MachineFrameInfo &MFI, - unsigned i) const; - SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, - const SDLoc &dl, SelectionDAG &DAG, - const CCValAssign &VA, - ISD::ArgFlagsTy Flags, bool isByval) const; - - // Call lowering helpers. - - /// Check whether the call is eligible for tail call optimization. Targets - /// that want to do tail call optimization should implement this function. 
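getTypeToTransformTo above retypes the x87 f80 type as a 96-bit integer. A small standalone illustration of that mapping together with the storage sizes involved; the helper name is made up, and the printed size depends on the host ABI (commonly 12 bytes on i386 and 16 on x86-64, both holding an 80-bit value):

  #include <cstdio>

  // Mirrors the hook above for the one special case it handles: f80 -> i96.
  unsigned transformedIntegerBits(unsigned FPBits) {
    return FPBits == 80 ? 96 : FPBits;
  }

  int main() {
    std::printf("f80 is retyped as i%u\n", transformedIntegerBits(80));
    std::printf("host long double occupies %zu bytes\n", sizeof(long double));
    return 0;
  }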
- bool IsEligibleForTailCallOptimization( - TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo, - SmallVectorImpl &ArgLocs, bool IsCalleePopSRet) const; - SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr, - SDValue Chain, bool IsTailCall, - bool Is64Bit, int FPDiff, - const SDLoc &dl) const; - - unsigned GetAlignedArgumentStackSize(unsigned StackSize, - SelectionDAG &DAG) const; - - unsigned getAddressSpace() const; - - SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, - SDValue &Chain) const; - SDValue LRINT_LLRINTHelper(SDNode *N, SelectionDAG &DAG) const; - - SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - - unsigned getGlobalWrapperKind(const GlobalValue *GV, - const unsigned char OpFlags) const; - SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; - - /// Creates target global address or external symbol nodes for calls or - /// other uses. - SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG, bool ForCall, - bool *IsImpCall) const; - - SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerLRINT_LLRINT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerGET_FPENV_MEM(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSET_FPENV_MEM(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerRESET_FPENV(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerWin64_FP_TO_INT128(SDValue Op, SelectionDAG &DAG, - SDValue 
&Chain) const; - SDValue LowerWin64_INT128_TO_FP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerGC_TRANSITION(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const; - - SDValue - LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, - const SDLoc &dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) const override; - SDValue LowerCall(CallLoweringInfo &CLI, - SmallVectorImpl &InVals) const override; - - SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - const SDLoc &dl, SelectionDAG &DAG) const override; - - bool supportSplitCSR(MachineFunction *MF) const override { - return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS && - MF->getFunction().hasFnAttribute(Attribute::NoUnwind); - } - void initializeSplitCSR(MachineBasicBlock *Entry) const override; - void insertCopiesSplitCSR( + SDValue LowerCallResult(SDValue Chain, SDValue InGlue, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, + const SDLoc &dl, SelectionDAG &DAG, + SmallVectorImpl &InVals, + uint32_t *RegMask) const; + SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, + const SmallVectorImpl &ArgInfo, + const SDLoc &dl, SelectionDAG &DAG, + const CCValAssign &VA, MachineFrameInfo &MFI, + unsigned i) const; + SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, + const SDLoc &dl, SelectionDAG &DAG, + const CCValAssign &VA, ISD::ArgFlagsTy Flags, + bool isByval) const; + + // Call lowering helpers. + + /// Check whether the call is eligible for tail call optimization. Targets + /// that want to do tail call optimization should implement this function. + bool IsEligibleForTailCallOptimization(TargetLowering::CallLoweringInfo &CLI, + CCState &CCInfo, + SmallVectorImpl &ArgLocs, + bool IsCalleePopSRet) const; + SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr, + SDValue Chain, bool IsTailCall, bool Is64Bit, + int FPDiff, const SDLoc &dl) const; + + unsigned GetAlignedArgumentStackSize(unsigned StackSize, + SelectionDAG &DAG) const; + + unsigned getAddressSpace() const; + + SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, + SDValue &Chain) const; + SDValue LRINT_LLRINTHelper(SDNode *N, SelectionDAG &DAG) const; + + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + + unsigned getGlobalWrapperKind(const GlobalValue *GV, + const unsigned char OpFlags) const; + SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; + + /// Creates target global address or external symbol nodes for calls or + /// other uses. 
+ SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG, bool ForCall, + bool *IsImpCall) const; + + SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerLRINT_LLRINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGET_FPENV_MEM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSET_FPENV_MEM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerRESET_FPENV(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerWin64_FP_TO_INT128(SDValue Op, SelectionDAG &DAG, + SDValue &Chain) const; + SDValue LowerWin64_INT128_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGC_TRANSITION(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Ins, + const SDLoc &dl, SelectionDAG &DAG, + SmallVectorImpl &InVals) const override; + SDValue LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl &InVals) const override; + + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, const SDLoc &dl, + SelectionDAG &DAG) const override; + + bool supportSplitCSR(MachineFunction *MF) const override { + return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getFunction().hasFnAttribute(Attribute::NoUnwind); + } + void initializeSplitCSR(MachineBasicBlock *Entry) const override; + void insertCopiesSplitCSR( MachineBasicBlock *Entry, const SmallVectorImpl &Exits) const override; - bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; + bool 
isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; - bool mayBeEmittedAsTailCall(const CallInst *CI) const override; + bool mayBeEmittedAsTailCall(const CallInst *CI) const override; - EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, - ISD::NodeType ExtendKind) const override; + EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, + ISD::NodeType ExtendKind) const override; - bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, - bool isVarArg, - const SmallVectorImpl &Outs, - LLVMContext &Context, - const Type *RetTy) const override; + bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool isVarArg, + const SmallVectorImpl &Outs, + LLVMContext &Context, const Type *RetTy) const override; - const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; - ArrayRef getRoundingControlRegisters() const override; + const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; + ArrayRef getRoundingControlRegisters() const override; - TargetLoweringBase::AtomicExpansionKind - shouldExpandAtomicLoadInIR(LoadInst *LI) const override; - TargetLoweringBase::AtomicExpansionKind - shouldExpandAtomicStoreInIR(StoreInst *SI) const override; - TargetLoweringBase::AtomicExpansionKind - shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; - TargetLoweringBase::AtomicExpansionKind - shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const; - void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override; - void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const override; + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicLoadInIR(LoadInst *LI) const override; + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicStoreInIR(StoreInst *SI) const override; + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + TargetLoweringBase::AtomicExpansionKind + shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const; + void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override; + void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const override; - LoadInst * - lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; + LoadInst *lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; - bool needsCmpXchgNb(Type *MemType) const; + bool needsCmpXchgNb(Type *MemType) const; - void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB, - MachineBasicBlock *DispatchBB, int FI) const; + void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB, + MachineBasicBlock *DispatchBB, int FI) const; - // Utility function to emit the low-level va_arg code for X86-64. - MachineBasicBlock * - EmitVAARGWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const; + // Utility function to emit the low-level va_arg code for X86-64. + MachineBasicBlock *EmitVAARGWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *MBB) const; + + /// Utility function to emit the xmm reg save portion of va_start. + MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1, + MachineInstr &MI2, + MachineBasicBlock *BB) const; - /// Utility function to emit the xmm reg save portion of va_start. 
- MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1, - MachineInstr &MI2, - MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredSelect(MachineInstr &I, + MachineBasicBlock *BB) const; - MachineBasicBlock *EmitLoweredSelect(MachineInstr &I, + MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const; - MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI, - MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI, + MachineBasicBlock *BB) const; - MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI, - MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredProbedAlloca(MachineInstr &MI, + MachineBasicBlock *BB) const; - MachineBasicBlock *EmitLoweredProbedAlloca(MachineInstr &MI, - MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI, + MachineBasicBlock *BB) const; - MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI, - MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredIndirectThunk(MachineInstr &MI, + MachineBasicBlock *BB) const; - MachineBasicBlock *EmitLoweredIndirectThunk(MachineInstr &MI, - MachineBasicBlock *BB) const; + MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI, + MachineBasicBlock *MBB) const; - MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI, - MachineBasicBlock *MBB) const; + void emitSetJmpShadowStackFix(MachineInstr &MI, MachineBasicBlock *MBB) const; - void emitSetJmpShadowStackFix(MachineInstr &MI, - MachineBasicBlock *MBB) const; + MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI, + MachineBasicBlock *MBB) const; - MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI, - MachineBasicBlock *MBB) const; + MachineBasicBlock *emitLongJmpShadowStackFix(MachineInstr &MI, + MachineBasicBlock *MBB) const; - MachineBasicBlock *emitLongJmpShadowStackFix(MachineInstr &MI, - MachineBasicBlock *MBB) const; + MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI, + MachineBasicBlock *MBB) const; + + MachineBasicBlock *emitPatchableEventCall(MachineInstr &MI, + MachineBasicBlock *MBB) const; + + /// Emit flags for the given setcc condition and operands. Also returns the + /// corresponding X86 condition code constant in X86CC. + SDValue emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC, + const SDLoc &dl, SelectionDAG &DAG, + SDValue &X86CC) const; + + bool optimizeFMulOrFDivAsShiftAddBitcast(SDNode *N, SDValue FPConst, + SDValue IntPow2) const override; + + /// Check if replacement of SQRT with RSQRT should be disabled. + bool isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const override; + + /// Use rsqrt* to speed up sqrt calculations. + SDValue getSqrtEstimate(SDValue Op, SelectionDAG &DAG, int Enabled, + int &RefinementSteps, bool &UseOneConstNR, + bool Reciprocal) const override; + + /// Use rcp* to speed up fdiv calculations. + SDValue getRecipEstimate(SDValue Op, SelectionDAG &DAG, int Enabled, + int &RefinementSteps) const override; + + /// Reassociate floating point divisions into multiply by reciprocal. + unsigned combineRepeatedFPDivisors() const override; + + SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, + SmallVectorImpl &Created) const override; + + SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, + SDValue V2) const; +}; + +namespace X86 { +FastISel *createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo); +} // end namespace X86 + +// X86 specific Gather/Scatter nodes. 
+// The class has the same order of operands as MaskedGatherScatterSDNode for +// convenience. +class X86MaskedGatherScatterSDNode : public MemIntrinsicSDNode { +public: + // This is a intended as a utility and should never be directly created. + X86MaskedGatherScatterSDNode() = delete; + ~X86MaskedGatherScatterSDNode() = delete; + + const SDValue &getBasePtr() const { return getOperand(3); } + const SDValue &getIndex() const { return getOperand(4); } + const SDValue &getMask() const { return getOperand(2); } + const SDValue &getScale() const { return getOperand(5); } + + static bool classof(const SDNode *N) { + return N->getOpcode() == X86ISD::MGATHER || + N->getOpcode() == X86ISD::MSCATTER; + } +}; + +class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode { +public: + const SDValue &getPassThru() const { return getOperand(1); } + + static bool classof(const SDNode *N) { + return N->getOpcode() == X86ISD::MGATHER; + } +}; + +class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode { +public: + const SDValue &getValue() const { return getOperand(1); } + + static bool classof(const SDNode *N) { + return N->getOpcode() == X86ISD::MSCATTER; + } +}; + +/// Generate unpacklo/unpackhi shuffle mask. +void createUnpackShuffleMask(EVT VT, SmallVectorImpl &Mask, bool Lo, + bool Unary); - MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI, - MachineBasicBlock *MBB) const; - - MachineBasicBlock *emitPatchableEventCall(MachineInstr &MI, - MachineBasicBlock *MBB) const; - - /// Emit flags for the given setcc condition and operands. Also returns the - /// corresponding X86 condition code constant in X86CC. - SDValue emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC, - const SDLoc &dl, SelectionDAG &DAG, - SDValue &X86CC) const; - - bool optimizeFMulOrFDivAsShiftAddBitcast(SDNode *N, SDValue FPConst, - SDValue IntPow2) const override; - - /// Check if replacement of SQRT with RSQRT should be disabled. - bool isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const override; - - /// Use rsqrt* to speed up sqrt calculations. - SDValue getSqrtEstimate(SDValue Op, SelectionDAG &DAG, int Enabled, - int &RefinementSteps, bool &UseOneConstNR, - bool Reciprocal) const override; - - /// Use rcp* to speed up fdiv calculations. - SDValue getRecipEstimate(SDValue Op, SelectionDAG &DAG, int Enabled, - int &RefinementSteps) const override; - - /// Reassociate floating point divisions into multiply by reciprocal. - unsigned combineRepeatedFPDivisors() const override; - - SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, - SmallVectorImpl &Created) const override; - - SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, - SDValue V2) const; - }; - - namespace X86 { - FastISel *createFastISel(FunctionLoweringInfo &funcInfo, - const TargetLibraryInfo *libInfo); - } // end namespace X86 - - // X86 specific Gather/Scatter nodes. - // The class has the same order of operands as MaskedGatherScatterSDNode for - // convenience. - class X86MaskedGatherScatterSDNode : public MemIntrinsicSDNode { - public: - // This is a intended as a utility and should never be directly created. 
- X86MaskedGatherScatterSDNode() = delete; - ~X86MaskedGatherScatterSDNode() = delete; - - const SDValue &getBasePtr() const { return getOperand(3); } - const SDValue &getIndex() const { return getOperand(4); } - const SDValue &getMask() const { return getOperand(2); } - const SDValue &getScale() const { return getOperand(5); } - - static bool classof(const SDNode *N) { - return N->getOpcode() == X86ISD::MGATHER || - N->getOpcode() == X86ISD::MSCATTER; - } - }; - - class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode { - public: - const SDValue &getPassThru() const { return getOperand(1); } - - static bool classof(const SDNode *N) { - return N->getOpcode() == X86ISD::MGATHER; - } - }; - - class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode { - public: - const SDValue &getValue() const { return getOperand(1); } - - static bool classof(const SDNode *N) { - return N->getOpcode() == X86ISD::MSCATTER; - } - }; - - /// Generate unpacklo/unpackhi shuffle mask. - void createUnpackShuffleMask(EVT VT, SmallVectorImpl &Mask, bool Lo, - bool Unary); - - /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation - /// imposed by AVX and specific to the unary pattern. Example: - /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3> - /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7> - void createSplat2ShuffleMask(MVT VT, SmallVectorImpl &Mask, bool Lo); +/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation +/// imposed by AVX and specific to the unary pattern. Example: +/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3> +/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7> +void createSplat2ShuffleMask(MVT VT, SmallVectorImpl &Mask, bool Lo); } // end namespace llvm diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 765db86ffafb3..d73c3aa0e1e82 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -690,8 +690,7 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const { .addImm(31)); } else { // Negate to convert 1 -> 0xFFFFFFFF, 0 -> 0x00000000 (negl %eax) - recordInstr(BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR) - .addReg(TmpGPR)); + recordInstr(BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR).addReg(TmpGPR)); } // Broadcast to TmpX (vector mask) @@ -848,7 +847,8 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const { .setMIFlags(MachineInstr::MIFlag::NoMerge)); } - assert(FirstInstr && LastInstr && "Expected at least one expanded instruction"); + assert(FirstInstr && LastInstr && + "Expected at least one expanded instruction"); auto BundleEnd = LastInstr->getIterator(); finalizeBundle(*MBB, FirstInstr->getIterator(), std::next(BundleEnd)); @@ -916,25 +916,28 @@ bool X86InstrInfo::expandCtSelectWithCMOV(MachineInstr &MI) const { /// Expand i386-specific CTSELECT pseudo instructions (post-RA, constant-time) /// These internal pseudos receive a pre-materialized condition byte from the -/// custom inserter, avoiding EFLAGS corruption issues during i64 type legalization. +/// custom inserter, avoiding EFLAGS corruption issues during i64 type +/// legalization. 
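For reference, the expansion below implements the usual constant-time bitmask select: the condition byte is zero-extended, negated into an all-ones or all-zeros mask, and the two sources are combined with AND/NOT/OR so that every instruction executes regardless of the condition value. A minimal C++ sketch of the same pattern (illustrative only; the helper name is made up, and the exact operand polarity in the real expansion depends on how the custom inserter materializes the condition byte):

  #include <cstdint>

  // Branch-free select: both inputs are processed on every call, so the
  // timing does not depend on the value of `cond`.
  static inline uint32_t ct_select_u32(uint8_t cond, uint32_t a, uint32_t b) {
    uint32_t mask = 0u - (uint32_t)(cond & 1); // 1 -> 0xFFFFFFFF, 0 -> 0x00000000
    return (a & mask) | (b & ~mask);           // mask all-ones -> a, all-zeros -> b
  }
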
bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const { MachineBasicBlock *MBB = MI.getParent(); DebugLoc DL = MI.getDebugLoc(); // CTSELECT_I386_INT_GRxxrr has operands: (outs dst, tmp_byte, tmp_mask), // (ins src1, src2, cond_byte) - // Note: cond_byte is pre-materialized by custom inserter, not EFLAGS-dependent + // Note: cond_byte is pre-materialized by custom inserter, not + // EFLAGS-dependent Register DstReg = MI.getOperand(0).getReg(); Register TmpByteReg = MI.getOperand(1).getReg(); Register TmpMaskReg = MI.getOperand(2).getReg(); Register Src1Reg = MI.getOperand(3).getReg(); Register Src2Reg = MI.getOperand(4).getReg(); - Register CondByteReg = MI.getOperand(5).getReg(); // Pre-materialized condition byte + Register CondByteReg = + MI.getOperand(5).getReg(); // Pre-materialized condition byte // Determine instruction opcodes based on register width unsigned MovZXOp, NegOp, MovOp, AndOp, NotOp, OrOp; if (MI.getOpcode() == X86::CTSELECT_I386_INT_GR8rr) { - MovZXOp = 0; // No zero-extend needed for GR8 + MovZXOp = 0; // No zero-extend needed for GR8 NegOp = X86::NEG8r; MovOp = X86::MOV8rr; AndOp = X86::AND8rr; @@ -963,8 +966,8 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const { // Step 1: Copy pre-materialized condition byte to TmpByteReg // This allows the bundle to work with allocated temporaries auto I1 = BuildMI(*MBB, MI, DL, get(X86::MOV8rr), TmpByteReg) - .addReg(CondByteReg) - .setMIFlag(MachineInstr::MIFlag::NoMerge); + .addReg(CondByteReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); auto BundleStart = I1->getIterator(); // Step 2: Zero-extend condition byte to register width (0 or 1) @@ -975,7 +978,9 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const { } // Step 3: Convert condition to bitmask (NEG: 1 -> 0xFFFF..., 0 -> 0x0000...) - Register MaskReg = (MI.getOpcode() == X86::CTSELECT_I386_INT_GR8rr) ? TmpByteReg : TmpMaskReg; + Register MaskReg = (MI.getOpcode() == X86::CTSELECT_I386_INT_GR8rr) + ? TmpByteReg + : TmpMaskReg; BuildMI(*MBB, MI, DL, get(NegOp), MaskReg) .addReg(MaskReg) .setMIFlag(MachineInstr::MIFlag::NoMerge); @@ -1003,9 +1008,9 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const { // Step 8: Final result: (src1 & mask) | (src2 & ~mask) auto LI = BuildMI(*MBB, MI, DL, get(OrOp), DstReg) - .addReg(DstReg) - .addReg(MaskReg) - .setMIFlag(MachineInstr::MIFlag::NoMerge); + .addReg(DstReg) + .addReg(MaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); // Bundle all generated instructions for atomic execution before removing MI auto BundleEnd = std::next(LI->getIterator()); @@ -1014,11 +1019,12 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const { finalizeBundle(*MBB, BundleStart, BundleEnd); } - // TODO: Optimization opportunity - The register allocator may choose callee-saved - // registers (e.g., %ebx, %esi) for TmpByteReg/TmpMaskReg, causing unnecessary - // save/restore overhead. Consider constraining these to caller-saved register - // classes (e.g., GR8_AL, GR32_CallSaved) in the TableGen definitions to improve - // constant-time performance by eliminating prologue/epilogue instructions. + // TODO: Optimization opportunity - The register allocator may choose + // callee-saved registers (e.g., %ebx, %esi) for TmpByteReg/TmpMaskReg, + // causing unnecessary save/restore overhead. 
Consider constraining these to + // caller-saved register classes (e.g., GR8_AL, GR32_CallSaved) in the + // TableGen definitions to improve constant-time performance by eliminating + // prologue/epilogue instructions. // Remove the original pseudo instruction MI.eraseFromParent(); @@ -1306,8 +1312,7 @@ static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) { return isPICBase; } -bool X86InstrInfo::isReMaterializableImpl( - const MachineInstr &MI) const { +bool X86InstrInfo::isReMaterializableImpl(const MachineInstr &MI) const { switch (MI.getOpcode()) { default: // This function should only be called for opcodes with the ReMaterializable @@ -1826,32 +1831,32 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, switch (MIOpc) { default: llvm_unreachable("Unreachable!"); - CASE_NF(SHL8ri) - CASE_NF(SHL16ri) { - unsigned ShAmt = MI.getOperand(2).getImm(); - MIB.addReg(0) - .addImm(1LL << ShAmt) - .addReg(InRegLEA, RegState::Kill) - .addImm(0) - .addReg(0); - break; - } - CASE_NF(INC8r) - CASE_NF(INC16r) + CASE_NF(SHL8ri) + CASE_NF(SHL16ri) { + unsigned ShAmt = MI.getOperand(2).getImm(); + MIB.addReg(0) + .addImm(1LL << ShAmt) + .addReg(InRegLEA, RegState::Kill) + .addImm(0) + .addReg(0); + break; + } + CASE_NF(INC8r) + CASE_NF(INC16r) addRegOffset(MIB, InRegLEA, true, 1); break; - CASE_NF(DEC8r) - CASE_NF(DEC16r) + CASE_NF(DEC8r) + CASE_NF(DEC16r) addRegOffset(MIB, InRegLEA, true, -1); break; - CASE_NF(ADD8ri) - CASE_NF(ADD16ri) + CASE_NF(ADD8ri) + CASE_NF(ADD16ri) case X86::ADD8ri_DB: case X86::ADD16ri_DB: addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm()); break; - CASE_NF(ADD8rr) - CASE_NF(ADD16rr) + CASE_NF(ADD8rr) + CASE_NF(ADD16rr) case X86::ADD8rr_DB: case X86::ADD16rr_DB: { Src2 = MI.getOperand(2).getReg(); @@ -1989,128 +1994,129 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI, switch (MIOpc) { default: llvm_unreachable("Unreachable!"); - CASE_NF(SHL64ri) { - assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!"); - unsigned ShAmt = getTruncatedShiftCount(MI, 2); - if (!isTruncatedShiftCountForLEA(ShAmt)) - return nullptr; - - // LEA can't handle RSP. - if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass( - Src.getReg(), &X86::GR64_NOSPRegClass)) - return nullptr; + CASE_NF(SHL64ri) { + assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!"); + unsigned ShAmt = getTruncatedShiftCount(MI, 2); + if (!isTruncatedShiftCountForLEA(ShAmt)) + return nullptr; - NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)) - .add(Dest) - .addReg(0) - .addImm(1LL << ShAmt) - .add(Src) - .addImm(0) - .addReg(0); - break; - } - CASE_NF(SHL32ri) { - assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!"); - unsigned ShAmt = getTruncatedShiftCount(MI, 2); - if (!isTruncatedShiftCountForLEA(ShAmt)) - return nullptr; + // LEA can't handle RSP. + if (Src.getReg().isVirtual() && + !MF.getRegInfo().constrainRegClass(Src.getReg(), + &X86::GR64_NOSPRegClass)) + return nullptr; - unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; + NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)) + .add(Dest) + .addReg(0) + .addImm(1LL << ShAmt) + .add(Src) + .addImm(0) + .addReg(0); + break; + } + CASE_NF(SHL32ri) { + assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!"); + unsigned ShAmt = getTruncatedShiftCount(MI, 2); + if (!isTruncatedShiftCountForLEA(ShAmt)) + return nullptr; - // LEA can't handle ESP. 
- bool isKill; - MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); - if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg, - isKill, ImplicitOp, LV, LIS)) - return nullptr; + unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; - MachineInstrBuilder MIB = - BuildMI(MF, MI.getDebugLoc(), get(Opc)) - .add(Dest) - .addReg(0) - .addImm(1LL << ShAmt) - .addReg(SrcReg, getKillRegState(isKill), SrcSubReg) - .addImm(0) - .addReg(0); - if (ImplicitOp.getReg() != 0) - MIB.add(ImplicitOp); - NewMI = MIB; + // LEA can't handle ESP. + bool isKill; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg, + isKill, ImplicitOp, LV, LIS)) + return nullptr; - // Add kills if classifyLEAReg created a new register. - if (LV && SrcReg != Src.getReg()) - LV->getVarInfo(SrcReg).Kills.push_back(NewMI); - break; - } - CASE_NF(SHL8ri) + MachineInstrBuilder MIB = + BuildMI(MF, MI.getDebugLoc(), get(Opc)) + .add(Dest) + .addReg(0) + .addImm(1LL << ShAmt) + .addReg(SrcReg, getKillRegState(isKill), SrcSubReg) + .addImm(0) + .addReg(0); + if (ImplicitOp.getReg() != 0) + MIB.add(ImplicitOp); + NewMI = MIB; + + // Add kills if classifyLEAReg created a new register. + if (LV && SrcReg != Src.getReg()) + LV->getVarInfo(SrcReg).Kills.push_back(NewMI); + break; + } + CASE_NF(SHL8ri) Is8BitOp = true; [[fallthrough]]; - CASE_NF(SHL16ri) { - assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!"); - unsigned ShAmt = getTruncatedShiftCount(MI, 2); - if (!isTruncatedShiftCountForLEA(ShAmt)) - return nullptr; - return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp); - } - CASE_NF(INC64r) - CASE_NF(INC32r) { - assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!"); - unsigned Opc = (MIOpc == X86::INC64r || MIOpc == X86::INC64r_NF) - ? X86::LEA64r - : (Is64Bit ? X86::LEA64_32r : X86::LEA32r); - bool isKill; - MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); - if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg, - isKill, ImplicitOp, LV, LIS)) - return nullptr; - - MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) - .add(Dest) - .addReg(SrcReg, getKillRegState(isKill)); - if (ImplicitOp.getReg() != 0) - MIB.add(ImplicitOp); + CASE_NF(SHL16ri) { + assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!"); + unsigned ShAmt = getTruncatedShiftCount(MI, 2); + if (!isTruncatedShiftCountForLEA(ShAmt)) + return nullptr; + return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp); + } + CASE_NF(INC64r) + CASE_NF(INC32r) { + assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!"); + unsigned Opc = (MIOpc == X86::INC64r || MIOpc == X86::INC64r_NF) + ? X86::LEA64r + : (Is64Bit ? X86::LEA64_32r : X86::LEA32r); + bool isKill; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg, + isKill, ImplicitOp, LV, LIS)) + return nullptr; - NewMI = addOffset(MIB, 1); + MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) + .add(Dest) + .addReg(SrcReg, getKillRegState(isKill)); + if (ImplicitOp.getReg() != 0) + MIB.add(ImplicitOp); - // Add kills if classifyLEAReg created a new register. 
- if (LV && SrcReg != Src.getReg()) - LV->getVarInfo(SrcReg).Kills.push_back(NewMI); - break; - } - CASE_NF(DEC64r) - CASE_NF(DEC32r) { - assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!"); - unsigned Opc = (MIOpc == X86::DEC64r || MIOpc == X86::DEC64r_NF) - ? X86::LEA64r - : (Is64Bit ? X86::LEA64_32r : X86::LEA32r); + NewMI = addOffset(MIB, 1); - bool isKill; - MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); - if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg, - isKill, ImplicitOp, LV, LIS)) - return nullptr; + // Add kills if classifyLEAReg created a new register. + if (LV && SrcReg != Src.getReg()) + LV->getVarInfo(SrcReg).Kills.push_back(NewMI); + break; + } + CASE_NF(DEC64r) + CASE_NF(DEC32r) { + assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!"); + unsigned Opc = (MIOpc == X86::DEC64r || MIOpc == X86::DEC64r_NF) + ? X86::LEA64r + : (Is64Bit ? X86::LEA64_32r : X86::LEA32r); + + bool isKill; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg, + isKill, ImplicitOp, LV, LIS)) + return nullptr; - MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) - .add(Dest) - .addReg(SrcReg, getKillRegState(isKill)); - if (ImplicitOp.getReg() != 0) - MIB.add(ImplicitOp); + MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) + .add(Dest) + .addReg(SrcReg, getKillRegState(isKill)); + if (ImplicitOp.getReg() != 0) + MIB.add(ImplicitOp); - NewMI = addOffset(MIB, -1); + NewMI = addOffset(MIB, -1); - // Add kills if classifyLEAReg created a new register. - if (LV && SrcReg != Src.getReg()) - LV->getVarInfo(SrcReg).Kills.push_back(NewMI); - break; - } - CASE_NF(DEC8r) - CASE_NF(INC8r) + // Add kills if classifyLEAReg created a new register. + if (LV && SrcReg != Src.getReg()) + LV->getVarInfo(SrcReg).Kills.push_back(NewMI); + break; + } + CASE_NF(DEC8r) + CASE_NF(INC8r) Is8BitOp = true; [[fallthrough]]; - CASE_NF(DEC16r) - CASE_NF(INC16r) + CASE_NF(DEC16r) + CASE_NF(INC16r) return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp); - CASE_NF(ADD64rr) - CASE_NF(ADD32rr) + CASE_NF(ADD64rr) + CASE_NF(ADD32rr) case X86::ADD64rr_DB: case X86::ADD32rr_DB: { assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); @@ -2161,21 +2167,21 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI, NumRegOperands = 3; break; } - CASE_NF(ADD8rr) + CASE_NF(ADD8rr) case X86::ADD8rr_DB: Is8BitOp = true; [[fallthrough]]; - CASE_NF(ADD16rr) + CASE_NF(ADD16rr) case X86::ADD16rr_DB: return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp); - CASE_NF(ADD64ri32) + CASE_NF(ADD64ri32) case X86::ADD64ri32_DB: assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); NewMI = addOffset( BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src), MI.getOperand(2)); break; - CASE_NF(ADD32ri) + CASE_NF(ADD32ri) case X86::ADD32ri_DB: { assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); unsigned Opc = Is64Bit ? 
X86::LEA64_32r : X86::LEA32r; @@ -2200,62 +2206,62 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI, LV->getVarInfo(SrcReg).Kills.push_back(NewMI); break; } - CASE_NF(ADD8ri) + CASE_NF(ADD8ri) case X86::ADD8ri_DB: Is8BitOp = true; [[fallthrough]]; - CASE_NF(ADD16ri) + CASE_NF(ADD16ri) case X86::ADD16ri_DB: return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp); - CASE_NF(SUB8ri) - CASE_NF(SUB16ri) + CASE_NF(SUB8ri) + CASE_NF(SUB16ri) /// FIXME: Support these similar to ADD8ri/ADD16ri*. return nullptr; - CASE_NF(SUB32ri) { - if (!MI.getOperand(2).isImm()) - return nullptr; - int64_t Imm = MI.getOperand(2).getImm(); - if (!isInt<32>(-Imm)) - return nullptr; + CASE_NF(SUB32ri) { + if (!MI.getOperand(2).isImm()) + return nullptr; + int64_t Imm = MI.getOperand(2).getImm(); + if (!isInt<32>(-Imm)) + return nullptr; - assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); - unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; + assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); + unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; - bool isKill; - MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); - if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg, - isKill, ImplicitOp, LV, LIS)) - return nullptr; + bool isKill; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg, + isKill, ImplicitOp, LV, LIS)) + return nullptr; - MachineInstrBuilder MIB = - BuildMI(MF, MI.getDebugLoc(), get(Opc)) - .add(Dest) - .addReg(SrcReg, getKillRegState(isKill), SrcSubReg); - if (ImplicitOp.getReg() != 0) - MIB.add(ImplicitOp); + MachineInstrBuilder MIB = + BuildMI(MF, MI.getDebugLoc(), get(Opc)) + .add(Dest) + .addReg(SrcReg, getKillRegState(isKill), SrcSubReg); + if (ImplicitOp.getReg() != 0) + MIB.add(ImplicitOp); - NewMI = addOffset(MIB, -Imm); + NewMI = addOffset(MIB, -Imm); - // Add kills if classifyLEAReg created a new register. - if (LV && SrcReg != Src.getReg()) - LV->getVarInfo(SrcReg).Kills.push_back(NewMI); - break; - } + // Add kills if classifyLEAReg created a new register. 
+ if (LV && SrcReg != Src.getReg()) + LV->getVarInfo(SrcReg).Kills.push_back(NewMI); + break; + } - CASE_NF(SUB64ri32) { - if (!MI.getOperand(2).isImm()) - return nullptr; - int64_t Imm = MI.getOperand(2).getImm(); - if (!isInt<32>(-Imm)) - return nullptr; + CASE_NF(SUB64ri32) { + if (!MI.getOperand(2).isImm()) + return nullptr; + int64_t Imm = MI.getOperand(2).getImm(); + if (!isInt<32>(-Imm)) + return nullptr; - assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!"); + assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!"); - MachineInstrBuilder MIB = - BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src); - NewMI = addOffset(MIB, -Imm); - break; - } + MachineInstrBuilder MIB = + BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src); + NewMI = addOffset(MIB, -Imm); + break; + } case X86::VMOVDQU8Z128rmk: case X86::VMOVDQU8Z256rmk: @@ -2855,17 +2861,17 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, case X86::OP##_ND: switch (Opc) { - // SHLD B, C, I <-> SHRD C, B, (BitWidth - I) - CASE_ND(SHRD16rri8) - CASE_ND(SHLD16rri8) - CASE_ND(SHRD32rri8) - CASE_ND(SHLD32rri8) - CASE_ND(SHRD64rri8) - CASE_ND(SHLD64rri8) { - unsigned Size; - switch (Opc) { - default: - llvm_unreachable("Unreachable!"); + // SHLD B, C, I <-> SHRD C, B, (BitWidth - I) + CASE_ND(SHRD16rri8) + CASE_ND(SHLD16rri8) + CASE_ND(SHRD32rri8) + CASE_ND(SHLD32rri8) + CASE_ND(SHRD64rri8) + CASE_ND(SHLD64rri8) { + unsigned Size; + switch (Opc) { + default: + llvm_unreachable("Unreachable!"); #define FROM_TO_SIZE(A, B, S) \ case X86::A: \ Opc = X86::B; \ @@ -2884,16 +2890,16 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, Size = S; \ break; - FROM_TO_SIZE(SHRD16rri8, SHLD16rri8, 16) - FROM_TO_SIZE(SHRD32rri8, SHLD32rri8, 32) - FROM_TO_SIZE(SHRD64rri8, SHLD64rri8, 64) + FROM_TO_SIZE(SHRD16rri8, SHLD16rri8, 16) + FROM_TO_SIZE(SHRD32rri8, SHLD32rri8, 32) + FROM_TO_SIZE(SHRD64rri8, SHLD64rri8, 64) #undef FROM_TO_SIZE + } + WorkingMI = CloneIfNew(MI); + WorkingMI->setDesc(get(Opc)); + WorkingMI->getOperand(3).setImm(Size - MI.getOperand(3).getImm()); + break; } - WorkingMI = CloneIfNew(MI); - WorkingMI->setDesc(get(Opc)); - WorkingMI->getOperand(3).setImm(Size - MI.getOperand(3).getImm()); - break; - } case X86::PFSUBrr: case X86::PFSUBRrr: // PFSUB x, y: x = x - y @@ -3177,15 +3183,16 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, WorkingMI = CloneIfNew(MI); WorkingMI->setDesc(get(Opc)); break; - CASE_ND(CMOV16rr) - CASE_ND(CMOV32rr) - CASE_ND(CMOV64rr) { - WorkingMI = CloneIfNew(MI); - unsigned OpNo = MI.getDesc().getNumOperands() - 1; - X86::CondCode CC = static_cast(MI.getOperand(OpNo).getImm()); - WorkingMI->getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC)); - break; - } + CASE_ND(CMOV16rr) + CASE_ND(CMOV32rr) + CASE_ND(CMOV64rr) { + WorkingMI = CloneIfNew(MI); + unsigned OpNo = MI.getDesc().getNumOperands() - 1; + X86::CondCode CC = + static_cast(MI.getOperand(OpNo).getImm()); + WorkingMI->getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC)); + break; + } case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi: case X86::VPTERNLOGDZ128rri: @@ -5393,29 +5400,29 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, CmpMask = CmpValue = 0; } return true; - // A SUB can be used to perform comparison. - CASE_ND(SUB64rm) - CASE_ND(SUB32rm) - CASE_ND(SUB16rm) - CASE_ND(SUB8rm) + // A SUB can be used to perform comparison. 
+ CASE_ND(SUB64rm) + CASE_ND(SUB32rm) + CASE_ND(SUB16rm) + CASE_ND(SUB8rm) SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; CmpMask = 0; CmpValue = 0; return true; - CASE_ND(SUB64rr) - CASE_ND(SUB32rr) - CASE_ND(SUB16rr) - CASE_ND(SUB8rr) + CASE_ND(SUB64rr) + CASE_ND(SUB32rr) + CASE_ND(SUB16rr) + CASE_ND(SUB8rr) SrcReg = MI.getOperand(1).getReg(); SrcReg2 = MI.getOperand(2).getReg(); CmpMask = 0; CmpValue = 0; return true; - CASE_ND(SUB64ri32) - CASE_ND(SUB32ri) - CASE_ND(SUB16ri) - CASE_ND(SUB8ri) + CASE_ND(SUB64ri32) + CASE_ND(SUB32ri) + CASE_ND(SUB16ri) + CASE_ND(SUB8ri) SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; if (MI.getOperand(2).isImm()) { @@ -5470,27 +5477,27 @@ bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI, case X86::CMP32rr: case X86::CMP16rr: case X86::CMP8rr: - CASE_ND(SUB64rr) - CASE_ND(SUB32rr) - CASE_ND(SUB16rr) - CASE_ND(SUB8rr) { - Register OISrcReg; - Register OISrcReg2; - int64_t OIMask; - int64_t OIValue; - if (!analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) || - OIMask != ImmMask || OIValue != ImmValue) + CASE_ND(SUB64rr) + CASE_ND(SUB32rr) + CASE_ND(SUB16rr) + CASE_ND(SUB8rr) { + Register OISrcReg; + Register OISrcReg2; + int64_t OIMask; + int64_t OIValue; + if (!analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) || + OIMask != ImmMask || OIValue != ImmValue) + return false; + if (SrcReg == OISrcReg && SrcReg2 == OISrcReg2) { + *IsSwapped = false; + return true; + } + if (SrcReg == OISrcReg2 && SrcReg2 == OISrcReg) { + *IsSwapped = true; + return true; + } return false; - if (SrcReg == OISrcReg && SrcReg2 == OISrcReg2) { - *IsSwapped = false; - return true; - } - if (SrcReg == OISrcReg2 && SrcReg2 == OISrcReg) { - *IsSwapped = true; - return true; } - return false; - } case X86::CMP64ri32: case X86::CMP32ri: case X86::CMP16ri: @@ -5499,10 +5506,10 @@ bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI, case X86::TEST32ri: case X86::TEST16ri: case X86::TEST8ri: - CASE_ND(SUB64ri32) - CASE_ND(SUB32ri) - CASE_ND(SUB16ri) - CASE_ND(SUB8ri) + CASE_ND(SUB64ri32) + CASE_ND(SUB32ri) + CASE_ND(SUB16ri) + CASE_ND(SUB8ri) case X86::TEST64rr: case X86::TEST32rr: case X86::TEST16rr: @@ -5559,98 +5566,98 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag, default: return false; - // The shift instructions only modify ZF if their shift count is non-zero. - // N.B.: The processor truncates the shift count depending on the encoding. - CASE_ND(SAR8ri) - CASE_ND(SAR16ri) - CASE_ND(SAR32ri) - CASE_ND(SAR64ri) - CASE_ND(SHR8ri) - CASE_ND(SHR16ri) - CASE_ND(SHR32ri) - CASE_ND(SHR64ri) + // The shift instructions only modify ZF if their shift count is non-zero. + // N.B.: The processor truncates the shift count depending on the encoding. + CASE_ND(SAR8ri) + CASE_ND(SAR16ri) + CASE_ND(SAR32ri) + CASE_ND(SAR64ri) + CASE_ND(SHR8ri) + CASE_ND(SHR16ri) + CASE_ND(SHR32ri) + CASE_ND(SHR64ri) return getTruncatedShiftCount(MI, 2) != 0; - // Some left shift instructions can be turned into LEA instructions but only - // if their flags aren't used. Avoid transforming such instructions. - CASE_ND(SHL8ri) - CASE_ND(SHL16ri) - CASE_ND(SHL32ri) - CASE_ND(SHL64ri) { - unsigned ShAmt = getTruncatedShiftCount(MI, 2); - if (isTruncatedShiftCountForLEA(ShAmt)) - return false; - return ShAmt != 0; - } + // Some left shift instructions can be turned into LEA instructions but only + // if their flags aren't used. Avoid transforming such instructions. 
+ CASE_ND(SHL8ri) + CASE_ND(SHL16ri) + CASE_ND(SHL32ri) + CASE_ND(SHL64ri) { + unsigned ShAmt = getTruncatedShiftCount(MI, 2); + if (isTruncatedShiftCountForLEA(ShAmt)) + return false; + return ShAmt != 0; + } - CASE_ND(SHRD16rri8) - CASE_ND(SHRD32rri8) - CASE_ND(SHRD64rri8) - CASE_ND(SHLD16rri8) - CASE_ND(SHLD32rri8) - CASE_ND(SHLD64rri8) + CASE_ND(SHRD16rri8) + CASE_ND(SHRD32rri8) + CASE_ND(SHRD64rri8) + CASE_ND(SHLD16rri8) + CASE_ND(SHLD32rri8) + CASE_ND(SHLD64rri8) return getTruncatedShiftCount(MI, 3) != 0; - CASE_ND(SUB64ri32) - CASE_ND(SUB32ri) - CASE_ND(SUB16ri) - CASE_ND(SUB8ri) - CASE_ND(SUB64rr) - CASE_ND(SUB32rr) - CASE_ND(SUB16rr) - CASE_ND(SUB8rr) - CASE_ND(SUB64rm) - CASE_ND(SUB32rm) - CASE_ND(SUB16rm) - CASE_ND(SUB8rm) - CASE_ND(DEC64r) - CASE_ND(DEC32r) - CASE_ND(DEC16r) - CASE_ND(DEC8r) - CASE_ND(ADD64ri32) - CASE_ND(ADD32ri) - CASE_ND(ADD16ri) - CASE_ND(ADD8ri) - CASE_ND(ADD64rr) - CASE_ND(ADD32rr) - CASE_ND(ADD16rr) - CASE_ND(ADD8rr) - CASE_ND(ADD64rm) - CASE_ND(ADD32rm) - CASE_ND(ADD16rm) - CASE_ND(ADD8rm) - CASE_ND(INC64r) - CASE_ND(INC32r) - CASE_ND(INC16r) - CASE_ND(INC8r) - CASE_ND(ADC64ri32) - CASE_ND(ADC32ri) - CASE_ND(ADC16ri) - CASE_ND(ADC8ri) - CASE_ND(ADC64rr) - CASE_ND(ADC32rr) - CASE_ND(ADC16rr) - CASE_ND(ADC8rr) - CASE_ND(ADC64rm) - CASE_ND(ADC32rm) - CASE_ND(ADC16rm) - CASE_ND(ADC8rm) - CASE_ND(SBB64ri32) - CASE_ND(SBB32ri) - CASE_ND(SBB16ri) - CASE_ND(SBB8ri) - CASE_ND(SBB64rr) - CASE_ND(SBB32rr) - CASE_ND(SBB16rr) - CASE_ND(SBB8rr) - CASE_ND(SBB64rm) - CASE_ND(SBB32rm) - CASE_ND(SBB16rm) - CASE_ND(SBB8rm) - CASE_ND(NEG8r) - CASE_ND(NEG16r) - CASE_ND(NEG32r) - CASE_ND(NEG64r) + CASE_ND(SUB64ri32) + CASE_ND(SUB32ri) + CASE_ND(SUB16ri) + CASE_ND(SUB8ri) + CASE_ND(SUB64rr) + CASE_ND(SUB32rr) + CASE_ND(SUB16rr) + CASE_ND(SUB8rr) + CASE_ND(SUB64rm) + CASE_ND(SUB32rm) + CASE_ND(SUB16rm) + CASE_ND(SUB8rm) + CASE_ND(DEC64r) + CASE_ND(DEC32r) + CASE_ND(DEC16r) + CASE_ND(DEC8r) + CASE_ND(ADD64ri32) + CASE_ND(ADD32ri) + CASE_ND(ADD16ri) + CASE_ND(ADD8ri) + CASE_ND(ADD64rr) + CASE_ND(ADD32rr) + CASE_ND(ADD16rr) + CASE_ND(ADD8rr) + CASE_ND(ADD64rm) + CASE_ND(ADD32rm) + CASE_ND(ADD16rm) + CASE_ND(ADD8rm) + CASE_ND(INC64r) + CASE_ND(INC32r) + CASE_ND(INC16r) + CASE_ND(INC8r) + CASE_ND(ADC64ri32) + CASE_ND(ADC32ri) + CASE_ND(ADC16ri) + CASE_ND(ADC8ri) + CASE_ND(ADC64rr) + CASE_ND(ADC32rr) + CASE_ND(ADC16rr) + CASE_ND(ADC8rr) + CASE_ND(ADC64rm) + CASE_ND(ADC32rm) + CASE_ND(ADC16rm) + CASE_ND(ADC8rm) + CASE_ND(SBB64ri32) + CASE_ND(SBB32ri) + CASE_ND(SBB16ri) + CASE_ND(SBB8ri) + CASE_ND(SBB64rr) + CASE_ND(SBB32rr) + CASE_ND(SBB16rr) + CASE_ND(SBB8rr) + CASE_ND(SBB64rm) + CASE_ND(SBB32rm) + CASE_ND(SBB16rm) + CASE_ND(SBB8rm) + CASE_ND(NEG8r) + CASE_ND(NEG16r) + CASE_ND(NEG32r) + CASE_ND(NEG64r) case X86::LZCNT16rr: case X86::LZCNT16rm: case X86::LZCNT32rr: @@ -5670,42 +5677,42 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag, case X86::TZCNT64rr: case X86::TZCNT64rm: return true; - CASE_ND(AND64ri32) - CASE_ND(AND32ri) - CASE_ND(AND16ri) - CASE_ND(AND8ri) - CASE_ND(AND64rr) - CASE_ND(AND32rr) - CASE_ND(AND16rr) - CASE_ND(AND8rr) - CASE_ND(AND64rm) - CASE_ND(AND32rm) - CASE_ND(AND16rm) - CASE_ND(AND8rm) - CASE_ND(XOR64ri32) - CASE_ND(XOR32ri) - CASE_ND(XOR16ri) - CASE_ND(XOR8ri) - CASE_ND(XOR64rr) - CASE_ND(XOR32rr) - CASE_ND(XOR16rr) - CASE_ND(XOR8rr) - CASE_ND(XOR64rm) - CASE_ND(XOR32rm) - CASE_ND(XOR16rm) - CASE_ND(XOR8rm) - CASE_ND(OR64ri32) - CASE_ND(OR32ri) - CASE_ND(OR16ri) - CASE_ND(OR8ri) - CASE_ND(OR64rr) - CASE_ND(OR32rr) - 
CASE_ND(OR16rr) - CASE_ND(OR8rr) - CASE_ND(OR64rm) - CASE_ND(OR32rm) - CASE_ND(OR16rm) - CASE_ND(OR8rm) + CASE_ND(AND64ri32) + CASE_ND(AND32ri) + CASE_ND(AND16ri) + CASE_ND(AND8ri) + CASE_ND(AND64rr) + CASE_ND(AND32rr) + CASE_ND(AND16rr) + CASE_ND(AND8rr) + CASE_ND(AND64rm) + CASE_ND(AND32rm) + CASE_ND(AND16rm) + CASE_ND(AND8rm) + CASE_ND(XOR64ri32) + CASE_ND(XOR32ri) + CASE_ND(XOR16ri) + CASE_ND(XOR8ri) + CASE_ND(XOR64rr) + CASE_ND(XOR32rr) + CASE_ND(XOR16rr) + CASE_ND(XOR8rr) + CASE_ND(XOR64rm) + CASE_ND(XOR32rm) + CASE_ND(XOR16rm) + CASE_ND(XOR8rm) + CASE_ND(OR64ri32) + CASE_ND(OR32ri) + CASE_ND(OR16ri) + CASE_ND(OR8ri) + CASE_ND(OR64rr) + CASE_ND(OR32rr) + CASE_ND(OR16rr) + CASE_ND(OR8rr) + CASE_ND(OR64rm) + CASE_ND(OR32rm) + CASE_ND(OR16rm) + CASE_ND(OR8rm) case X86::ANDN32rr: case X86::ANDN32rm: case X86::ANDN64rr: @@ -5783,15 +5790,17 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag, } /// Check whether the use can be converted to remove a comparison against zero. -/// Returns the EFLAGS condition and the operand that we are comparing against zero. -static std::pair isUseDefConvertible(const MachineInstr &MI) { +/// Returns the EFLAGS condition and the operand that we are comparing against +/// zero. +static std::pair +isUseDefConvertible(const MachineInstr &MI) { switch (MI.getOpcode()) { default: return std::make_pair(X86::COND_INVALID, ~0U); - CASE_ND(NEG8r) - CASE_ND(NEG16r) - CASE_ND(NEG32r) - CASE_ND(NEG64r) + CASE_ND(NEG8r) + CASE_ND(NEG16r) + CASE_ND(NEG32r) + CASE_ND(NEG64r) return std::make_pair(X86::COND_AE, 1U); case X86::LZCNT16rr: case X86::LZCNT32rr: @@ -5835,51 +5844,53 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, switch (CmpInstr.getOpcode()) { default: break; - CASE_ND(SUB64ri32) - CASE_ND(SUB32ri) - CASE_ND(SUB16ri) - CASE_ND(SUB8ri) - CASE_ND(SUB64rm) - CASE_ND(SUB32rm) - CASE_ND(SUB16rm) - CASE_ND(SUB8rm) - CASE_ND(SUB64rr) - CASE_ND(SUB32rr) - CASE_ND(SUB16rr) - CASE_ND(SUB8rr) { - if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg())) - return false; - // There is no use of the destination register, we can replace SUB with CMP. - unsigned NewOpcode = 0; + CASE_ND(SUB64ri32) + CASE_ND(SUB32ri) + CASE_ND(SUB16ri) + CASE_ND(SUB8ri) + CASE_ND(SUB64rm) + CASE_ND(SUB32rm) + CASE_ND(SUB16rm) + CASE_ND(SUB8rm) + CASE_ND(SUB64rr) + CASE_ND(SUB32rr) + CASE_ND(SUB16rr) + CASE_ND(SUB8rr) { + if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg())) + return false; + // There is no use of the destination register, we can replace SUB with + // CMP. 
+ unsigned NewOpcode = 0; #define FROM_TO(A, B) \ CASE_ND(A) NewOpcode = X86::B; \ break; - switch (CmpInstr.getOpcode()) { - default: - llvm_unreachable("Unreachable!"); - FROM_TO(SUB64rm, CMP64rm) - FROM_TO(SUB32rm, CMP32rm) - FROM_TO(SUB16rm, CMP16rm) - FROM_TO(SUB8rm, CMP8rm) - FROM_TO(SUB64rr, CMP64rr) - FROM_TO(SUB32rr, CMP32rr) - FROM_TO(SUB16rr, CMP16rr) - FROM_TO(SUB8rr, CMP8rr) - FROM_TO(SUB64ri32, CMP64ri32) - FROM_TO(SUB32ri, CMP32ri) - FROM_TO(SUB16ri, CMP16ri) - FROM_TO(SUB8ri, CMP8ri) - } + switch (CmpInstr.getOpcode()) { + default: + llvm_unreachable("Unreachable!"); + FROM_TO(SUB64rm, CMP64rm) + FROM_TO(SUB32rm, CMP32rm) + FROM_TO(SUB16rm, CMP16rm) + FROM_TO(SUB8rm, CMP8rm) + FROM_TO(SUB64rr, CMP64rr) + FROM_TO(SUB32rr, CMP32rr) + FROM_TO(SUB16rr, CMP16rr) + FROM_TO(SUB8rr, CMP8rr) + FROM_TO(SUB64ri32, CMP64ri32) + FROM_TO(SUB32ri, CMP32ri) + FROM_TO(SUB16ri, CMP16ri) + FROM_TO(SUB8ri, CMP8ri) + } #undef FROM_TO - CmpInstr.setDesc(get(NewOpcode)); - CmpInstr.removeOperand(0); - // Mutating this instruction invalidates any debug data associated with it. - CmpInstr.dropDebugNumber(); - // Fall through to optimize Cmp if Cmp is CMPrr or CMPri. - if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm || - NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm) - return false; - } + CmpInstr.setDesc(get(NewOpcode)); + CmpInstr.removeOperand(0); + // Mutating this instruction invalidates any debug data associated with + // it. + CmpInstr.dropDebugNumber(); + // Fall through to optimize Cmp if Cmp is CMPrr or CMPri. + if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm || + NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm) + return false; + } } // The following code tries to remove the comparison by re-using EFLAGS @@ -6236,14 +6247,14 @@ static bool canConvert2Copy(unsigned Opc) { switch (Opc) { default: return false; - CASE_ND(ADD64ri32) - CASE_ND(SUB64ri32) - CASE_ND(OR64ri32) - CASE_ND(XOR64ri32) - CASE_ND(ADD32ri) - CASE_ND(SUB32ri) - CASE_ND(OR32ri) - CASE_ND(XOR32ri) + CASE_ND(ADD64ri32) + CASE_ND(SUB64ri32) + CASE_ND(OR64ri32) + CASE_ND(XOR64ri32) + CASE_ND(ADD32ri) + CASE_ND(SUB32ri) + CASE_ND(OR32ri) + CASE_ND(XOR32ri) return true; } } @@ -9627,7 +9638,7 @@ Register X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const { static const uint16_t *lookup(unsigned opcode, unsigned domain, ArrayRef Table) { - for (const uint16_t(&Row)[3] : Table) + for (const uint16_t (&Row)[3] : Table) if (Row[domain - 1] == opcode) return Row; return nullptr; @@ -9636,7 +9647,7 @@ static const uint16_t *lookup(unsigned opcode, unsigned domain, static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain, ArrayRef Table) { // If this is the integer domain make sure to check both integer columns. 
- for (const uint16_t(&Row)[4] : Table) + for (const uint16_t (&Row)[4] : Table) if (Row[domain - 1] == opcode || (domain == 3 && Row[3] == opcode)) return Row; return nullptr; @@ -10392,25 +10403,25 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst, if (Invert) return false; switch (Inst.getOpcode()) { - CASE_ND(ADD8rr) - CASE_ND(ADD16rr) - CASE_ND(ADD32rr) - CASE_ND(ADD64rr) - CASE_ND(AND8rr) - CASE_ND(AND16rr) - CASE_ND(AND32rr) - CASE_ND(AND64rr) - CASE_ND(OR8rr) - CASE_ND(OR16rr) - CASE_ND(OR32rr) - CASE_ND(OR64rr) - CASE_ND(XOR8rr) - CASE_ND(XOR16rr) - CASE_ND(XOR32rr) - CASE_ND(XOR64rr) - CASE_ND(IMUL16rr) - CASE_ND(IMUL32rr) - CASE_ND(IMUL64rr) + CASE_ND(ADD8rr) + CASE_ND(ADD16rr) + CASE_ND(ADD32rr) + CASE_ND(ADD64rr) + CASE_ND(AND8rr) + CASE_ND(AND16rr) + CASE_ND(AND32rr) + CASE_ND(AND64rr) + CASE_ND(OR8rr) + CASE_ND(OR16rr) + CASE_ND(OR32rr) + CASE_ND(OR64rr) + CASE_ND(XOR8rr) + CASE_ND(XOR16rr) + CASE_ND(XOR32rr) + CASE_ND(XOR64rr) + CASE_ND(IMUL16rr) + CASE_ND(IMUL32rr) + CASE_ND(IMUL64rr) case X86::PANDrr: case X86::PORrr: case X86::PXORrr: @@ -11451,8 +11462,8 @@ bool X86InstrInfo::getMachineCombinerPatterns( break; } } - return TargetInstrInfo::getMachineCombinerPatterns(Root, - Patterns, DoRegPressureReduce); + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, + DoRegPressureReduce); } static void diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index ebd7e070d5fe8..93fcfa2f288f3 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -320,8 +320,7 @@ class X86InstrInfo final : public X86GenInstrInfo { Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; - Register isLoadFromStackSlot(const MachineInstr &MI, - int &FrameIndex, + Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex, TypeSize &MemBytes) const override; /// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination /// stack locations as well. This uses a heuristic so it isn't @@ -331,8 +330,7 @@ class X86InstrInfo final : public X86GenInstrInfo { Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override; - Register isStoreToStackSlot(const MachineInstr &MI, - int &FrameIndex, + Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex, TypeSize &MemBytes) const override; /// isStoreToStackSlotPostFE - Check for post-frame ptr elimination /// stack locations as well. This uses a heuristic so it isn't @@ -494,12 +492,12 @@ class X86InstrInfo final : public X86GenInstrInfo { /// is likely that the referenced instruction has been changed. /// /// \returns true on success. - MachineInstr * - foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, - ArrayRef Ops, - MachineBasicBlock::iterator InsertPt, int FrameIndex, - LiveIntervals *LIS = nullptr, - VirtRegMap *VRM = nullptr) const override; + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, + ArrayRef Ops, + MachineBasicBlock::iterator InsertPt, + int FrameIndex, + LiveIntervals *LIS = nullptr, + VirtRegMap *VRM = nullptr) const override; /// Same as the previous version except it allows folding of any load and /// store from / to any address, not just from a specific stack slot. @@ -748,8 +746,7 @@ class X86InstrInfo final : public X86GenInstrInfo { /// /// If IsIntrinsic is set, operand 1 will be ignored for commuting. 
bool findThreeSrcCommutedOpIndices(const MachineInstr &MI, - unsigned &SrcOpIdx1, - unsigned &SrcOpIdx2, + unsigned &SrcOpIdx1, unsigned &SrcOpIdx2, bool IsIntrinsic = false) const; /// Returns true when instruction \p FlagI produces the same flags as \p OI. diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 66c9d75053640..33b5ae0eb8f7a 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -54,9 +54,10 @@ using namespace llvm; -static cl::opt EnableMachineCombinerPass("x86-machine-combiner", - cl::desc("Enable the machine combiner pass"), - cl::init(true), cl::Hidden); +static cl::opt + EnableMachineCombinerPass("x86-machine-combiner", + cl::desc("Enable the machine combiner pass"), + cl::init(true), cl::Hidden); static cl::opt EnableTileRAPass("x86-tile-ra", @@ -362,7 +363,7 @@ namespace { class X86PassConfig : public TargetPassConfig { public: X86PassConfig(X86TargetMachine &TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} + : TargetPassConfig(TM, PM) {} X86TargetMachine &getX86TargetMachine() const { return getTM(); @@ -401,10 +402,10 @@ char X86ExecutionDomainFix::ID; } // end anonymous namespace INITIALIZE_PASS_BEGIN(X86ExecutionDomainFix, "x86-execution-domain-fix", - "X86 Execution Domain Fix", false, false) + "X86 Execution Domain Fix", false, false) INITIALIZE_PASS_DEPENDENCY(ReachingDefInfoWrapperPass) INITIALIZE_PASS_END(X86ExecutionDomainFix, "x86-execution-domain-fix", - "X86 Execution Domain Fix", false, false) + "X86 Execution Domain Fix", false, false) TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) { return new X86PassConfig(*this, PM); @@ -621,7 +622,7 @@ void X86PassConfig::addPreEmitPass2() { (TT.isOSDarwin() && (M->getFunction("objc_retainAutoreleasedReturnValue") || M->getFunction("objc_unsafeClaimAutoreleasedReturnValue"))) || - F.hasFnAttribute("ct-select"); + F.hasFnAttribute("ct-select"); })); // Analyzes and emits pseudos to support Win x64 Unwind V2. 
This pass must run diff --git a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll index ea943307c644f..eec38fa581c6f 100644 --- a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll +++ b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll @@ -209,94 +209,84 @@ define double @test_ctselect_f64_basic(i1 %cond, double %a, double %b) nounwind define x86_fp80 @test_ctselect_f80_basic(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind { ; I386-NOCMOV-LABEL: test_ctselect_f80_basic: ; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebp +; I386-NOCMOV-NEXT: pushl %ebx ; I386-NOCMOV-NEXT: pushl %edi ; I386-NOCMOV-NEXT: pushl %esi -; I386-NOCMOV-NEXT: subl $12, %esp -; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: subl $40, %esp +; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: testb $1, %cl +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte Spill ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-NOCMOV-NEXT: movb %al, %ah -; I386-NOCMOV-NEXT: movzbl %ah, %edi -; I386-NOCMOV-NEXT: negl %edi -; I386-NOCMOV-NEXT: movl %edx, %esi -; I386-NOCMOV-NEXT: andl %edi, %esi -; I386-NOCMOV-NEXT: notl %edi -; I386-NOCMOV-NEXT: andl %ecx, %edi -; I386-NOCMOV-NEXT: orl %edi, %esi -; I386-NOCMOV-NEXT: movl %esi, (%esp) -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi +; I386-NOCMOV-NEXT: sete %ch +; I386-NOCMOV-NEXT: movb %ch, %al +; I386-NOCMOV-NEXT: movzbl %al, %ebp +; I386-NOCMOV-NEXT: negl %ebp +; I386-NOCMOV-NEXT: movl %edi, %ebx +; I386-NOCMOV-NEXT: andl %ebp, %ebx +; I386-NOCMOV-NEXT: notl %ebp +; I386-NOCMOV-NEXT: andl %edx, %ebp +; I386-NOCMOV-NEXT: orl %ebp, %ebx +; I386-NOCMOV-NEXT: testb $1, %cl +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-NOCMOV-NEXT: movb %al, %ah -; I386-NOCMOV-NEXT: movzbl %ah, %edi -; I386-NOCMOV-NEXT: negl %edi -; I386-NOCMOV-NEXT: movl %edx, %esi -; I386-NOCMOV-NEXT: andl %edi, %esi -; I386-NOCMOV-NEXT: notl %edi -; I386-NOCMOV-NEXT: andl %ecx, %edi -; I386-NOCMOV-NEXT: orl %edi, %esi -; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-NOCMOV-NEXT: movb %al, %ah -; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: sete %ch +; I386-NOCMOV-NEXT: movb %ch, %cl +; I386-NOCMOV-NEXT: movzbl %cl, %ebp +; I386-NOCMOV-NEXT: negl %ebp +; I386-NOCMOV-NEXT: movl %edx, %edi +; I386-NOCMOV-NEXT: andl %ebp, %edi +; I386-NOCMOV-NEXT: notl %ebp +; I386-NOCMOV-NEXT: andl %eax, %ebp +; I386-NOCMOV-NEXT: orl %ebp, %edi +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl (%esp), %ebx # 4-byte Reload +; I386-NOCMOV-NEXT: movb %al, %dl +; I386-NOCMOV-NEXT: movzbl %dl, %edi ; I386-NOCMOV-NEXT: negl %edi -; I386-NOCMOV-NEXT: movl %edx, %esi -; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, %ecx +; I386-NOCMOV-NEXT: andl %edi, %ecx ; I386-NOCMOV-NEXT: notl %edi -; I386-NOCMOV-NEXT: andl %ecx, %edi -; I386-NOCMOV-NEXT: orl 
%edi, %esi -; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: fldt (%esp) -; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: andl %ebx, %edi +; I386-NOCMOV-NEXT: orl %edi, %ecx +; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: addl $40, %esp ; I386-NOCMOV-NEXT: popl %esi ; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: popl %ebp ; I386-NOCMOV-NEXT: retl ; ; I386-CMOV-LABEL: test_ctselect_f80_basic: ; I386-CMOV: # %bb.0: -; I386-CMOV-NEXT: pushl %edi -; I386-CMOV-NEXT: pushl %esi -; I386-CMOV-NEXT: subl $12, %esp +; I386-CMOV-NEXT: subl $36, %esp +; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-CMOV-NEXT: movb %al, %ah -; I386-CMOV-NEXT: movzbl %ah, %edi -; I386-CMOV-NEXT: negl %edi -; I386-CMOV-NEXT: movl %edx, %esi -; I386-CMOV-NEXT: andl %edi, %esi -; I386-CMOV-NEXT: notl %edi -; I386-CMOV-NEXT: andl %ecx, %edi -; I386-CMOV-NEXT: orl %edi, %esi -; I386-CMOV-NEXT: movl %esi, (%esp) -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-CMOV-NEXT: movb %al, %ah -; I386-CMOV-NEXT: movzbl %ah, %edi -; I386-CMOV-NEXT: negl %edi -; I386-CMOV-NEXT: movl %edx, %esi -; I386-CMOV-NEXT: andl %edi, %esi -; I386-CMOV-NEXT: notl %edi -; I386-CMOV-NEXT: andl %ecx, %edi -; I386-CMOV-NEXT: orl %edi, %esi -; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-CMOV-NEXT: movb %al, %ah -; I386-CMOV-NEXT: movzbl %ah, %edi -; I386-CMOV-NEXT: negl %edi -; I386-CMOV-NEXT: movl %edx, %esi -; I386-CMOV-NEXT: andl %edi, %esi -; I386-CMOV-NEXT: notl %edi -; I386-CMOV-NEXT: andl %ecx, %edi -; I386-CMOV-NEXT: orl %edi, %esi -; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl %eax, (%esp) ; I386-CMOV-NEXT: fldt (%esp) -; I386-CMOV-NEXT: addl $12, %esp -; I386-CMOV-NEXT: popl %esi -; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: addl $36, %esp ; I386-CMOV-NEXT: retl %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b) ret x86_fp80 %result @@ -543,94 +533,84 @@ define float @test_ctselect_f32_nan(i1 %cond) nounwind { define x86_fp80 @test_ctselect_f80_alignment(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind { ; I386-NOCMOV-LABEL: test_ctselect_f80_alignment: ; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebp +; I386-NOCMOV-NEXT: pushl %ebx ; I386-NOCMOV-NEXT: pushl %edi ; I386-NOCMOV-NEXT: pushl %esi -; I386-NOCMOV-NEXT: subl $12, %esp -; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: sete %al -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: subl $40, %esp +; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movzbl 
{{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: testb $1, %cl +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte Spill ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-NOCMOV-NEXT: movb %al, %ah -; I386-NOCMOV-NEXT: movzbl %ah, %edi -; I386-NOCMOV-NEXT: negl %edi -; I386-NOCMOV-NEXT: movl %edx, %esi -; I386-NOCMOV-NEXT: andl %edi, %esi -; I386-NOCMOV-NEXT: notl %edi -; I386-NOCMOV-NEXT: andl %ecx, %edi -; I386-NOCMOV-NEXT: orl %edi, %esi -; I386-NOCMOV-NEXT: movl %esi, (%esp) -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi +; I386-NOCMOV-NEXT: sete %ch +; I386-NOCMOV-NEXT: movb %ch, %al +; I386-NOCMOV-NEXT: movzbl %al, %ebp +; I386-NOCMOV-NEXT: negl %ebp +; I386-NOCMOV-NEXT: movl %edi, %ebx +; I386-NOCMOV-NEXT: andl %ebp, %ebx +; I386-NOCMOV-NEXT: notl %ebp +; I386-NOCMOV-NEXT: andl %edx, %ebp +; I386-NOCMOV-NEXT: orl %ebp, %ebx +; I386-NOCMOV-NEXT: testb $1, %cl +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-NOCMOV-NEXT: movb %al, %ah -; I386-NOCMOV-NEXT: movzbl %ah, %edi -; I386-NOCMOV-NEXT: negl %edi -; I386-NOCMOV-NEXT: movl %edx, %esi -; I386-NOCMOV-NEXT: andl %edi, %esi -; I386-NOCMOV-NEXT: notl %edi -; I386-NOCMOV-NEXT: andl %ecx, %edi -; I386-NOCMOV-NEXT: orl %edi, %esi -; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-NOCMOV-NEXT: movb %al, %ah -; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: sete %ch +; I386-NOCMOV-NEXT: movb %ch, %cl +; I386-NOCMOV-NEXT: movzbl %cl, %ebp +; I386-NOCMOV-NEXT: negl %ebp +; I386-NOCMOV-NEXT: movl %edx, %edi +; I386-NOCMOV-NEXT: andl %ebp, %edi +; I386-NOCMOV-NEXT: notl %ebp +; I386-NOCMOV-NEXT: andl %eax, %ebp +; I386-NOCMOV-NEXT: orl %ebp, %edi +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl (%esp), %ebx # 4-byte Reload +; I386-NOCMOV-NEXT: movb %al, %dl +; I386-NOCMOV-NEXT: movzbl %dl, %edi ; I386-NOCMOV-NEXT: negl %edi -; I386-NOCMOV-NEXT: movl %edx, %esi -; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, %ecx +; I386-NOCMOV-NEXT: andl %edi, %ecx ; I386-NOCMOV-NEXT: notl %edi -; I386-NOCMOV-NEXT: andl %ecx, %edi -; I386-NOCMOV-NEXT: orl %edi, %esi -; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) -; I386-NOCMOV-NEXT: fldt (%esp) -; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: andl %ebx, %edi +; I386-NOCMOV-NEXT: orl %edi, %ecx +; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: addl $40, %esp ; I386-NOCMOV-NEXT: popl %esi ; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: popl %ebp ; I386-NOCMOV-NEXT: retl ; ; I386-CMOV-LABEL: test_ctselect_f80_alignment: ; I386-CMOV: # %bb.0: -; I386-CMOV-NEXT: pushl %edi -; I386-CMOV-NEXT: pushl %esi -; I386-CMOV-NEXT: subl $12, %esp +; I386-CMOV-NEXT: subl $36, %esp +; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl %eax, 
{{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-CMOV-NEXT: movb %al, %ah -; I386-CMOV-NEXT: movzbl %ah, %edi -; I386-CMOV-NEXT: negl %edi -; I386-CMOV-NEXT: movl %edx, %esi -; I386-CMOV-NEXT: andl %edi, %esi -; I386-CMOV-NEXT: notl %edi -; I386-CMOV-NEXT: andl %ecx, %edi -; I386-CMOV-NEXT: orl %edi, %esi -; I386-CMOV-NEXT: movl %esi, (%esp) -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-CMOV-NEXT: movb %al, %ah -; I386-CMOV-NEXT: movzbl %ah, %edi -; I386-CMOV-NEXT: negl %edi -; I386-CMOV-NEXT: movl %edx, %esi -; I386-CMOV-NEXT: andl %edi, %esi -; I386-CMOV-NEXT: notl %edi -; I386-CMOV-NEXT: andl %ecx, %edi -; I386-CMOV-NEXT: orl %edi, %esi -; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; I386-CMOV-NEXT: movb %al, %ah -; I386-CMOV-NEXT: movzbl %ah, %edi -; I386-CMOV-NEXT: negl %edi -; I386-CMOV-NEXT: movl %edx, %esi -; I386-CMOV-NEXT: andl %edi, %esi -; I386-CMOV-NEXT: notl %edi -; I386-CMOV-NEXT: andl %ecx, %edi -; I386-CMOV-NEXT: orl %edi, %esi -; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl %eax, (%esp) ; I386-CMOV-NEXT: fldt (%esp) -; I386-CMOV-NEXT: addl $12, %esp -; I386-CMOV-NEXT: popl %esi -; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: addl $36, %esp ; I386-CMOV-NEXT: retl %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b) ret x86_fp80 %result