diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 0dbade544eced..0c324cbab88bc 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -115,6 +115,7 @@ class VectorCombine {
   bool foldExtractedCmps(Instruction &I);
   bool foldSingleElementStore(Instruction &I);
   bool scalarizeLoadExtract(Instruction &I);
+  bool foldConcatOfBoolMasks(Instruction &I);
   bool foldPermuteOfBinops(Instruction &I);
   bool foldShuffleOfBinops(Instruction &I);
   bool foldShuffleOfCastops(Instruction &I);
@@ -1423,6 +1424,113 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
   return true;
 }
 
+/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
+/// to "(bitcast (concat X, Y))"
+/// where X/Y are bitcasted from i1 mask vectors.
+bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
+  Type *Ty = I.getType();
+  if (!Ty->isIntegerTy())
+    return false;
+
+  // TODO: Add big endian test coverage
+  if (DL->isBigEndian())
+    return false;
+
+  // Restrict to disjoint cases so the mask vectors aren't overlapping.
+  Instruction *X, *Y;
+  if (!match(&I, m_DisjointOr(m_Instruction(X), m_Instruction(Y))))
+    return false;
+
+  // Allow both sources to contain shl, to handle more generic pattern:
+  // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
+  Value *SrcX;
+  uint64_t ShAmtX = 0;
+  if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
+      !match(X, m_OneUse(
+                    m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX))))),
+                          m_ConstantInt(ShAmtX)))))
+    return false;
+
+  Value *SrcY;
+  uint64_t ShAmtY = 0;
+  if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
+      !match(Y, m_OneUse(
+                    m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY))))),
+                          m_ConstantInt(ShAmtY)))))
+    return false;
+
+  // Canonicalize larger shift to the RHS.
+  if (ShAmtX > ShAmtY) {
+    std::swap(X, Y);
+    std::swap(SrcX, SrcY);
+    std::swap(ShAmtX, ShAmtY);
+  }
+
+  // Ensure both sources are matching vXi1 bool mask types, and that the shift
+  // difference is the mask width so they can be easily concatenated together.
+  uint64_t ShAmtDiff = ShAmtY - ShAmtX;
+  unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
+  unsigned BitWidth = Ty->getPrimitiveSizeInBits();
+  auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
+  if (!MaskTy || SrcX->getType() != SrcY->getType() ||
+      !MaskTy->getElementType()->isIntegerTy(1) ||
+      MaskTy->getNumElements() != ShAmtDiff ||
+      MaskTy->getNumElements() > (BitWidth / 2))
+    return false;
+
+  auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
+  auto *ConcatIntTy =
+      Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
+  auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
+
+  SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
+  std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+
+  // TODO: Is it worth supporting multi use cases?
+ InstructionCost OldCost = 0; + OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind); + OldCost += + NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind); + OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy, + TTI::CastContextHint::None, CostKind); + OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy, + TTI::CastContextHint::None, CostKind); + + InstructionCost NewCost = 0; + NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, MaskTy, + ConcatMask, CostKind); + NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy, + TTI::CastContextHint::None, CostKind); + if (Ty != ConcatIntTy) + NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy, + TTI::CastContextHint::None, CostKind); + if (ShAmtX > 0) + NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind); + + if (NewCost > OldCost) + return false; + + // Build bool mask concatenation, bitcast back to scalar integer, and perform + // any residual zero-extension or shifting. + Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask); + Worklist.pushValue(Concat); + + Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy); + + if (Ty != ConcatIntTy) { + Worklist.pushValue(Result); + Result = Builder.CreateZExt(Result, Ty); + } + + if (ShAmtX > 0) { + Worklist.pushValue(Result); + Result = Builder.CreateShl(Result, ShAmtX); + } + + replaceValue(I, *Result); + return true; +} + /// Try to convert "shuffle (binop (shuffle, shuffle)), undef" /// --> "binop (shuffle), (shuffle)". bool VectorCombine::foldPermuteOfBinops(Instruction &I) { @@ -2945,6 +3053,9 @@ bool VectorCombine::run() { case Instruction::FCmp: MadeChange |= foldExtractExtract(I); break; + case Instruction::Or: + MadeChange |= foldConcatOfBoolMasks(I); + [[fallthrough]]; default: if (Instruction::isBinaryOp(Opcode)) { MadeChange |= foldExtractExtract(I); diff --git a/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll b/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll index 1aa03eedc5eb0..e46fc730fb5b8 100644 --- a/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll +++ b/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll @@ -1,20 +1,22 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s -; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s -; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s -; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 define i32 @movmsk_i32_v32i8_v16i8(<16 x i8> %v0, <16 x i8> %v1) { -; CHECK-LABEL: @movmsk_i32_v32i8_v16i8( -; CHECK-NEXT: [[C0:%.*]] = icmp slt <16 x i8> [[V0:%.*]], zeroinitializer -; CHECK-NEXT: [[C1:%.*]] = icmp slt <16 x i8> [[V1:%.*]], zeroinitializer -; CHECK-NEXT: [[B0:%.*]] = bitcast <16 x i1> [[C0]] to i16 -; CHECK-NEXT: [[B1:%.*]] = bitcast <16 x i1> [[C1]] to i16 -; 
CHECK-NEXT: [[Z0:%.*]] = zext i16 [[B0]] to i32 -; CHECK-NEXT: [[Z1:%.*]] = zext i16 [[B1]] to i32 -; CHECK-NEXT: [[S0:%.*]] = shl nuw i32 [[Z0]], 16 -; CHECK-NEXT: [[OR:%.*]] = or disjoint i32 [[S0]], [[Z1]] -; CHECK-NEXT: ret i32 [[OR]] +; SSE-LABEL: @movmsk_i32_v32i8_v16i8( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> +; SSE-NEXT: [[TMP2:%.*]] = icmp slt <32 x i8> [[TMP1]], zeroinitializer +; SSE-NEXT: [[OR:%.*]] = bitcast <32 x i1> [[TMP2]] to i32 +; SSE-NEXT: ret i32 [[OR]] +; +; AVX-LABEL: @movmsk_i32_v32i8_v16i8( +; AVX-NEXT: [[C0:%.*]] = icmp slt <16 x i8> [[V0:%.*]], zeroinitializer +; AVX-NEXT: [[C1:%.*]] = icmp slt <16 x i8> [[V1:%.*]], zeroinitializer +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i1> [[C1]], <16 x i1> [[C0]], <32 x i32> +; AVX-NEXT: [[OR:%.*]] = bitcast <32 x i1> [[TMP1]] to i32 +; AVX-NEXT: ret i32 [[OR]] ; %c0 = icmp slt <16 x i8> %v0, zeroinitializer %c1 = icmp slt <16 x i8> %v1, zeroinitializer @@ -28,16 +30,20 @@ define i32 @movmsk_i32_v32i8_v16i8(<16 x i8> %v0, <16 x i8> %v1) { } define i32 @movmsk_i32_v8i32_v4i32(<4 x i32> %v0, <4 x i32> %v1) { -; CHECK-LABEL: @movmsk_i32_v8i32_v4i32( -; CHECK-NEXT: [[C0:%.*]] = icmp slt <4 x i32> [[V0:%.*]], zeroinitializer -; CHECK-NEXT: [[C1:%.*]] = icmp slt <4 x i32> [[V1:%.*]], zeroinitializer -; CHECK-NEXT: [[B0:%.*]] = bitcast <4 x i1> [[C0]] to i4 -; CHECK-NEXT: [[B1:%.*]] = bitcast <4 x i1> [[C1]] to i4 -; CHECK-NEXT: [[Z0:%.*]] = zext i4 [[B0]] to i32 -; CHECK-NEXT: [[Z1:%.*]] = zext i4 [[B1]] to i32 -; CHECK-NEXT: [[S0:%.*]] = shl nuw i32 [[Z0]], 4 -; CHECK-NEXT: [[OR:%.*]] = or disjoint i32 [[S0]], [[Z1]] -; CHECK-NEXT: ret i32 [[OR]] +; SSE-LABEL: @movmsk_i32_v8i32_v4i32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> +; SSE-NEXT: [[TMP2:%.*]] = icmp slt <8 x i32> [[TMP1]], zeroinitializer +; SSE-NEXT: [[TMP3:%.*]] = bitcast <8 x i1> [[TMP2]] to i8 +; SSE-NEXT: [[OR:%.*]] = zext i8 [[TMP3]] to i32 +; SSE-NEXT: ret i32 [[OR]] +; +; AVX-LABEL: @movmsk_i32_v8i32_v4i32( +; AVX-NEXT: [[C0:%.*]] = icmp slt <4 x i32> [[V0:%.*]], zeroinitializer +; AVX-NEXT: [[C1:%.*]] = icmp slt <4 x i32> [[V1:%.*]], zeroinitializer +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[C1]], <4 x i1> [[C0]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8 +; AVX-NEXT: [[OR:%.*]] = zext i8 [[TMP2]] to i32 +; AVX-NEXT: ret i32 [[OR]] ; %c0 = icmp slt <4 x i32> %v0, zeroinitializer %c1 = icmp slt <4 x i32> %v1, zeroinitializer @@ -51,16 +57,20 @@ define i32 @movmsk_i32_v8i32_v4i32(<4 x i32> %v0, <4 x i32> %v1) { } define i64 @movmsk_i64_v32i8_v16i8(<16 x i8> %v0, <16 x i8> %v1) { -; CHECK-LABEL: @movmsk_i64_v32i8_v16i8( -; CHECK-NEXT: [[C0:%.*]] = icmp slt <16 x i8> [[V0:%.*]], zeroinitializer -; CHECK-NEXT: [[C1:%.*]] = icmp slt <16 x i8> [[V1:%.*]], zeroinitializer -; CHECK-NEXT: [[B0:%.*]] = bitcast <16 x i1> [[C0]] to i16 -; CHECK-NEXT: [[B1:%.*]] = bitcast <16 x i1> [[C1]] to i16 -; CHECK-NEXT: [[Z0:%.*]] = zext i16 [[B0]] to i64 -; CHECK-NEXT: [[Z1:%.*]] = zext i16 [[B1]] to i64 -; CHECK-NEXT: [[S0:%.*]] = shl nuw i64 [[Z0]], 16 -; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[S0]], [[Z1]] -; CHECK-NEXT: ret i64 [[OR]] +; SSE-LABEL: @movmsk_i64_v32i8_v16i8( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> +; SSE-NEXT: [[TMP2:%.*]] = icmp slt <32 x i8> [[TMP1]], zeroinitializer +; SSE-NEXT: [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32 +; SSE-NEXT: [[OR:%.*]] = 
zext i32 [[TMP3]] to i64 +; SSE-NEXT: ret i64 [[OR]] +; +; AVX-LABEL: @movmsk_i64_v32i8_v16i8( +; AVX-NEXT: [[C0:%.*]] = icmp slt <16 x i8> [[V0:%.*]], zeroinitializer +; AVX-NEXT: [[C1:%.*]] = icmp slt <16 x i8> [[V1:%.*]], zeroinitializer +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i1> [[C1]], <16 x i1> [[C0]], <32 x i32> +; AVX-NEXT: [[TMP2:%.*]] = bitcast <32 x i1> [[TMP1]] to i32 +; AVX-NEXT: [[OR:%.*]] = zext i32 [[TMP2]] to i64 +; AVX-NEXT: ret i64 [[OR]] ; %c0 = icmp slt <16 x i8> %v0, zeroinitializer %c1 = icmp slt <16 x i8> %v1, zeroinitializer @@ -74,16 +84,20 @@ define i64 @movmsk_i64_v32i8_v16i8(<16 x i8> %v0, <16 x i8> %v1) { } define i64 @movmsk_i64_v8i32_v4i32(<4 x i32> %v0, <4 x i32> %v1) { -; CHECK-LABEL: @movmsk_i64_v8i32_v4i32( -; CHECK-NEXT: [[C0:%.*]] = icmp slt <4 x i32> [[V0:%.*]], zeroinitializer -; CHECK-NEXT: [[C1:%.*]] = icmp slt <4 x i32> [[V1:%.*]], zeroinitializer -; CHECK-NEXT: [[B0:%.*]] = bitcast <4 x i1> [[C0]] to i4 -; CHECK-NEXT: [[B1:%.*]] = bitcast <4 x i1> [[C1]] to i4 -; CHECK-NEXT: [[Z0:%.*]] = zext i4 [[B0]] to i64 -; CHECK-NEXT: [[Z1:%.*]] = zext i4 [[B1]] to i64 -; CHECK-NEXT: [[S0:%.*]] = shl nuw i64 [[Z0]], 4 -; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[S0]], [[Z1]] -; CHECK-NEXT: ret i64 [[OR]] +; SSE-LABEL: @movmsk_i64_v8i32_v4i32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> +; SSE-NEXT: [[TMP2:%.*]] = icmp slt <8 x i32> [[TMP1]], zeroinitializer +; SSE-NEXT: [[TMP3:%.*]] = bitcast <8 x i1> [[TMP2]] to i8 +; SSE-NEXT: [[OR:%.*]] = zext i8 [[TMP3]] to i64 +; SSE-NEXT: ret i64 [[OR]] +; +; AVX-LABEL: @movmsk_i64_v8i32_v4i32( +; AVX-NEXT: [[C0:%.*]] = icmp slt <4 x i32> [[V0:%.*]], zeroinitializer +; AVX-NEXT: [[C1:%.*]] = icmp slt <4 x i32> [[V1:%.*]], zeroinitializer +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[C1]], <4 x i1> [[C0]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8 +; AVX-NEXT: [[OR:%.*]] = zext i8 [[TMP2]] to i64 +; AVX-NEXT: ret i64 [[OR]] ; %c0 = icmp slt <4 x i32> %v0, zeroinitializer %c1 = icmp slt <4 x i32> %v1, zeroinitializer @@ -97,26 +111,24 @@ define i64 @movmsk_i64_v8i32_v4i32(<4 x i32> %v0, <4 x i32> %v1) { } define i64 @movmsk_i64_v64i8_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) { -; CHECK-LABEL: @movmsk_i64_v64i8_v16i8( -; CHECK-NEXT: [[C0:%.*]] = icmp slt <16 x i8> [[V0:%.*]], zeroinitializer -; CHECK-NEXT: [[C1:%.*]] = icmp slt <16 x i8> [[V1:%.*]], zeroinitializer -; CHECK-NEXT: [[C2:%.*]] = icmp slt <16 x i8> [[V2:%.*]], zeroinitializer -; CHECK-NEXT: [[C3:%.*]] = icmp slt <16 x i8> [[V3:%.*]], zeroinitializer -; CHECK-NEXT: [[B0:%.*]] = bitcast <16 x i1> [[C0]] to i16 -; CHECK-NEXT: [[B1:%.*]] = bitcast <16 x i1> [[C1]] to i16 -; CHECK-NEXT: [[B2:%.*]] = bitcast <16 x i1> [[C2]] to i16 -; CHECK-NEXT: [[B3:%.*]] = bitcast <16 x i1> [[C3]] to i16 -; CHECK-NEXT: [[Z0:%.*]] = zext i16 [[B0]] to i64 -; CHECK-NEXT: [[Z1:%.*]] = zext i16 [[B1]] to i64 -; CHECK-NEXT: [[Z2:%.*]] = zext i16 [[B2]] to i64 -; CHECK-NEXT: [[Z3:%.*]] = zext i16 [[B3]] to i64 -; CHECK-NEXT: [[S0:%.*]] = shl nuw i64 [[Z0]], 48 -; CHECK-NEXT: [[S1:%.*]] = shl nuw i64 [[Z1]], 32 -; CHECK-NEXT: [[S2:%.*]] = shl nuw i64 [[Z2]], 16 -; CHECK-NEXT: [[OR0:%.*]] = or disjoint i64 [[S0]], [[S1]] -; CHECK-NEXT: [[OR1:%.*]] = or disjoint i64 [[S2]], [[Z3]] -; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[OR0]], [[OR1]] -; CHECK-NEXT: ret i64 [[OR]] +; SSE-LABEL: @movmsk_i64_v64i8_v16i8( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> 
[[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <64 x i32> +; SSE-NEXT: [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer +; SSE-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64 +; SSE-NEXT: ret i64 [[OR]] +; +; AVX-LABEL: @movmsk_i64_v64i8_v16i8( +; AVX-NEXT: [[C0:%.*]] = icmp slt <16 x i8> [[V0:%.*]], zeroinitializer +; AVX-NEXT: [[C1:%.*]] = icmp slt <16 x i8> [[V1:%.*]], zeroinitializer +; AVX-NEXT: [[C2:%.*]] = icmp slt <16 x i8> [[V2:%.*]], zeroinitializer +; AVX-NEXT: [[C3:%.*]] = icmp slt <16 x i8> [[V3:%.*]], zeroinitializer +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <16 x i1> [[C1]], <16 x i1> [[C0]], <32 x i32> +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i1> [[C3]], <16 x i1> [[C2]], <32 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <32 x i1> [[TMP1]], <32 x i1> [[TMP2]], <64 x i32> +; AVX-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP3]] to i64 +; AVX-NEXT: ret i64 [[OR]] ; %c0 = icmp slt <16 x i8> %v0, zeroinitializer %c1 = icmp slt <16 x i8> %v1, zeroinitializer @@ -140,26 +152,26 @@ define i64 @movmsk_i64_v64i8_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, } define i64 @movmsk_i64_v32i32_v4i32(<4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { -; CHECK-LABEL: @movmsk_i64_v32i32_v4i32( -; CHECK-NEXT: [[C0:%.*]] = icmp slt <4 x i32> [[V0:%.*]], zeroinitializer -; CHECK-NEXT: [[C1:%.*]] = icmp slt <4 x i32> [[V1:%.*]], zeroinitializer -; CHECK-NEXT: [[C2:%.*]] = icmp slt <4 x i32> [[V2:%.*]], zeroinitializer -; CHECK-NEXT: [[C3:%.*]] = icmp slt <4 x i32> [[V3:%.*]], zeroinitializer -; CHECK-NEXT: [[B0:%.*]] = bitcast <4 x i1> [[C0]] to i4 -; CHECK-NEXT: [[B1:%.*]] = bitcast <4 x i1> [[C1]] to i4 -; CHECK-NEXT: [[B2:%.*]] = bitcast <4 x i1> [[C2]] to i4 -; CHECK-NEXT: [[B3:%.*]] = bitcast <4 x i1> [[C3]] to i4 -; CHECK-NEXT: [[Z0:%.*]] = zext i4 [[B0]] to i64 -; CHECK-NEXT: [[Z1:%.*]] = zext i4 [[B1]] to i64 -; CHECK-NEXT: [[Z2:%.*]] = zext i4 [[B2]] to i64 -; CHECK-NEXT: [[Z3:%.*]] = zext i4 [[B3]] to i64 -; CHECK-NEXT: [[S1:%.*]] = shl nuw i64 [[Z0]], 12 -; CHECK-NEXT: [[S0:%.*]] = shl nuw i64 [[Z1]], 8 -; CHECK-NEXT: [[S2:%.*]] = shl nuw i64 [[Z2]], 4 -; CHECK-NEXT: [[OR0:%.*]] = or disjoint i64 [[S1]], [[S0]] -; CHECK-NEXT: [[OR1:%.*]] = or disjoint i64 [[S2]], [[Z3]] -; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[OR0]], [[OR1]] -; CHECK-NEXT: ret i64 [[OR]] +; SSE-LABEL: @movmsk_i64_v32i32_v4i32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <16 x i32> +; SSE-NEXT: [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer +; SSE-NEXT: [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; SSE-NEXT: [[OR:%.*]] = zext i16 [[TMP5]] to i64 +; SSE-NEXT: ret i64 [[OR]] +; +; AVX-LABEL: @movmsk_i64_v32i32_v4i32( +; AVX-NEXT: [[C0:%.*]] = icmp slt <4 x i32> [[V0:%.*]], zeroinitializer +; AVX-NEXT: [[C1:%.*]] = icmp slt <4 x i32> [[V1:%.*]], zeroinitializer +; AVX-NEXT: [[C2:%.*]] = icmp slt <4 x i32> [[V2:%.*]], zeroinitializer +; AVX-NEXT: [[C3:%.*]] = icmp slt <4 x i32> [[V3:%.*]], zeroinitializer +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i1> [[C1]], <4 x i1> [[C0]], <8 x i32> +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[C3]], <4 x i1> 
[[C2]], <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <16 x i32> +; AVX-NEXT: [[TMP4:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 +; AVX-NEXT: [[OR:%.*]] = zext i16 [[TMP4]] to i64 +; AVX-NEXT: ret i64 [[OR]] ; %c0 = icmp slt <4 x i32> %v0, zeroinitializer %c1 = icmp slt <4 x i32> %v1, zeroinitializer @@ -183,16 +195,24 @@ define i64 @movmsk_i64_v32i32_v4i32(<4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, } define i64 @movmsk_i64_v64i8_v32i8(<32 x i8> %v0, <32 x i8> %v1) { -; CHECK-LABEL: @movmsk_i64_v64i8_v32i8( -; CHECK-NEXT: [[C0:%.*]] = icmp slt <32 x i8> [[V0:%.*]], zeroinitializer -; CHECK-NEXT: [[C1:%.*]] = icmp slt <32 x i8> [[V1:%.*]], zeroinitializer -; CHECK-NEXT: [[B0:%.*]] = bitcast <32 x i1> [[C0]] to i32 -; CHECK-NEXT: [[B1:%.*]] = bitcast <32 x i1> [[C1]] to i32 -; CHECK-NEXT: [[Z0:%.*]] = zext i32 [[B0]] to i64 -; CHECK-NEXT: [[Z1:%.*]] = zext i32 [[B1]] to i64 -; CHECK-NEXT: [[S0:%.*]] = shl nuw i64 [[Z0]], 32 -; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[S0]], [[Z1]] -; CHECK-NEXT: ret i64 [[OR]] +; SSE-LABEL: @movmsk_i64_v64i8_v32i8( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[V1:%.*]], <32 x i8> [[V0:%.*]], <64 x i32> +; SSE-NEXT: [[TMP2:%.*]] = icmp slt <64 x i8> [[TMP1]], zeroinitializer +; SSE-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP2]] to i64 +; SSE-NEXT: ret i64 [[OR]] +; +; AVX2-LABEL: @movmsk_i64_v64i8_v32i8( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[V1:%.*]], <32 x i8> [[V0:%.*]], <64 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = icmp slt <64 x i8> [[TMP1]], zeroinitializer +; AVX2-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP2]] to i64 +; AVX2-NEXT: ret i64 [[OR]] +; +; AVX512-LABEL: @movmsk_i64_v64i8_v32i8( +; AVX512-NEXT: [[C0:%.*]] = icmp slt <32 x i8> [[V0:%.*]], zeroinitializer +; AVX512-NEXT: [[C1:%.*]] = icmp slt <32 x i8> [[V1:%.*]], zeroinitializer +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <32 x i1> [[C1]], <32 x i1> [[C0]], <64 x i32> +; AVX512-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP1]] to i64 +; AVX512-NEXT: ret i64 [[OR]] ; %c0 = icmp slt <32 x i8> %v0, zeroinitializer %c1 = icmp slt <32 x i8> %v1, zeroinitializer @@ -206,16 +226,27 @@ define i64 @movmsk_i64_v64i8_v32i8(<32 x i8> %v0, <32 x i8> %v1) { } define i32 @movmsk_i32_v16i32_v8i32(<8 x i32> %v0, <8 x i32> %v1) { -; CHECK-LABEL: @movmsk_i32_v16i32_v8i32( -; CHECK-NEXT: [[C0:%.*]] = icmp slt <8 x i32> [[V0:%.*]], zeroinitializer -; CHECK-NEXT: [[C1:%.*]] = icmp slt <8 x i32> [[V1:%.*]], zeroinitializer -; CHECK-NEXT: [[B0:%.*]] = bitcast <8 x i1> [[C0]] to i8 -; CHECK-NEXT: [[B1:%.*]] = bitcast <8 x i1> [[C1]] to i8 -; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[B0]] to i32 -; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[B1]] to i32 -; CHECK-NEXT: [[S0:%.*]] = shl nuw i32 [[Z0]], 8 -; CHECK-NEXT: [[OR:%.*]] = or disjoint i32 [[S0]], [[Z1]] -; CHECK-NEXT: ret i32 [[OR]] +; SSE-LABEL: @movmsk_i32_v16i32_v8i32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[V1:%.*]], <8 x i32> [[V0:%.*]], <16 x i32> +; SSE-NEXT: [[TMP2:%.*]] = icmp slt <16 x i32> [[TMP1]], zeroinitializer +; SSE-NEXT: [[TMP3:%.*]] = bitcast <16 x i1> [[TMP2]] to i16 +; SSE-NEXT: [[OR:%.*]] = zext i16 [[TMP3]] to i32 +; SSE-NEXT: ret i32 [[OR]] +; +; AVX2-LABEL: @movmsk_i32_v16i32_v8i32( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[V1:%.*]], <8 x i32> [[V0:%.*]], <16 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = icmp slt <16 x i32> [[TMP1]], zeroinitializer +; AVX2-NEXT: [[TMP3:%.*]] = bitcast <16 x i1> [[TMP2]] to i16 +; AVX2-NEXT: [[OR:%.*]] = zext i16 [[TMP3]] to i32 +; AVX2-NEXT: 
ret i32 [[OR]] +; +; AVX512-LABEL: @movmsk_i32_v16i32_v8i32( +; AVX512-NEXT: [[C0:%.*]] = icmp slt <8 x i32> [[V0:%.*]], zeroinitializer +; AVX512-NEXT: [[C1:%.*]] = icmp slt <8 x i32> [[V1:%.*]], zeroinitializer +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i1> [[C1]], <8 x i1> [[C0]], <16 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = bitcast <16 x i1> [[TMP1]] to i16 +; AVX512-NEXT: [[OR:%.*]] = zext i16 [[TMP2]] to i32 +; AVX512-NEXT: ret i32 [[OR]] ; %c0 = icmp slt <8 x i32> %v0, zeroinitializer %c1 = icmp slt <8 x i32> %v1, zeroinitializer @@ -229,16 +260,26 @@ define i32 @movmsk_i32_v16i32_v8i32(<8 x i32> %v0, <8 x i32> %v1) { } define i64 @PR111431(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) { -; CHECK-LABEL: @PR111431( -; CHECK-NEXT: [[C01:%.*]] = icmp eq <32 x i8> [[A0:%.*]], [[A1:%.*]] -; CHECK-NEXT: [[C02:%.*]] = icmp eq <32 x i8> [[A0]], [[A2:%.*]] -; CHECK-NEXT: [[B01:%.*]] = bitcast <32 x i1> [[C01]] to i32 -; CHECK-NEXT: [[B02:%.*]] = bitcast <32 x i1> [[C02]] to i32 -; CHECK-NEXT: [[Z01:%.*]] = zext i32 [[B01]] to i64 -; CHECK-NEXT: [[Z02:%.*]] = zext i32 [[B02]] to i64 -; CHECK-NEXT: [[SHL:%.*]] = shl nuw i64 [[Z01]], 32 -; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[SHL]], [[Z02]] -; CHECK-NEXT: ret i64 [[OR]] +; SSE-LABEL: @PR111431( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> [[A0]], <64 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <32 x i8> [[A2:%.*]], <32 x i8> [[A1:%.*]], <64 x i32> +; SSE-NEXT: [[TMP3:%.*]] = icmp eq <64 x i8> [[TMP1]], [[TMP2]] +; SSE-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP3]] to i64 +; SSE-NEXT: ret i64 [[OR]] +; +; AVX2-LABEL: @PR111431( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> [[A0]], <64 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <32 x i8> [[A2:%.*]], <32 x i8> [[A1:%.*]], <64 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = icmp eq <64 x i8> [[TMP1]], [[TMP2]] +; AVX2-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP3]] to i64 +; AVX2-NEXT: ret i64 [[OR]] +; +; AVX512-LABEL: @PR111431( +; AVX512-NEXT: [[C01:%.*]] = icmp eq <32 x i8> [[A0:%.*]], [[A1:%.*]] +; AVX512-NEXT: [[C02:%.*]] = icmp eq <32 x i8> [[A0]], [[A2:%.*]] +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <32 x i1> [[C02]], <32 x i1> [[C01]], <64 x i32> +; AVX512-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP1]] to i64 +; AVX512-NEXT: ret i64 [[OR]] ; %c01 = icmp eq <32 x i8> %a0, %a1 %c02 = icmp eq <32 x i8> %a0, %a2
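; Illustrative sketch (not part of the patch): the existing tests exercise the
; unshifted-LHS forms of the fold; the sketch below shows the more generic case
; the matcher also accepts, where both sources carry a shl (ShAmtX and ShAmtY
; both non-zero). Value names are hypothetical, and whether the rewrite
; actually fires still depends on the target's TTI cost comparison.
;
;   %bx = bitcast <4 x i1> %x to i4
;   %by = bitcast <4 x i1> %y to i4
;   %zx = zext i4 %bx to i32
;   %zy = zext i4 %by to i32
;   %sx = shl nuw i32 %zx, 4
;   %sy = shl nuw i32 %zy, 8
;   %or = or disjoint i32 %sy, %sx
;
; Here the shift difference (8 - 4 == 4) equals the mask width, so the two
; <4 x i1> masks are concatenated, bitcast to a scalar, then the residual zext
; and the common shift amount (ShAmtX == 4) are re-applied:
;
;   %concat = shufflevector <4 x i1> %x, <4 x i1> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;   %cast   = bitcast <8 x i1> %concat to i8
;   %zext   = zext i8 %cast to i32
;   %or     = shl i32 %zext, 4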