From 6d907d68f156539c8b8902fd9aaa096a95552a57 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 11 Dec 2024 09:28:49 -0800 Subject: [PATCH] =?UTF-8?q?Revert=20"[VectorCombine]=20Fold=20"(or=20(zext?= =?UTF-8?q?=20(bitcast=20X)),=20(shl=20(zext=20(bitcast=20Y))=E2=80=A6"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 08f904011f4b17e46b7616737a5dec01e3563c80. --- .../Transforms/Vectorize/VectorCombine.cpp | 111 -------- .../PhaseOrdering/X86/concat-boolmasks.ll | 269 ++++++++---------- 2 files changed, 114 insertions(+), 266 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 567da5e52fd1c..9003642f1f93b 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -115,7 +115,6 @@ class VectorCombine { bool foldExtractedCmps(Instruction &I); bool foldSingleElementStore(Instruction &I); bool scalarizeLoadExtract(Instruction &I); - bool foldConcatOfBoolMasks(Instruction &I); bool foldPermuteOfBinops(Instruction &I); bool foldShuffleOfBinops(Instruction &I); bool foldShuffleOfCastops(Instruction &I); @@ -1424,113 +1423,6 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { return true; } -/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))" -/// to "(bitcast (concat X, Y))" -/// where X/Y are bitcasted from i1 mask vectors. -bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) { - Type *Ty = I.getType(); - if (!Ty->isIntegerTy()) - return false; - - // TODO: Add big endian test coverage - if (DL->isBigEndian()) - return false; - - // Restrict to disjoint cases so the mask vectors aren't overlapping. - Instruction *X, *Y; - if (!match(&I, m_DisjointOr(m_Instruction(X), m_Instruction(Y)))) - return false; - - // Allow both sources to contain shl, to handle more generic pattern: - // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))" - Value *SrcX; - uint64_t ShAmtX = 0; - if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) && - !match(X, m_OneUse( - m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX))))), - m_ConstantInt(ShAmtX))))) - return false; - - Value *SrcY; - uint64_t ShAmtY = 0; - if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) && - !match(Y, m_OneUse( - m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY))))), - m_ConstantInt(ShAmtY))))) - return false; - - // Canonicalize larger shift to the RHS. - if (ShAmtX > ShAmtY) { - std::swap(X, Y); - std::swap(SrcX, SrcY); - std::swap(ShAmtX, ShAmtY); - } - - // Ensure both sources are matching vXi1 bool mask types, and that the shift - // difference is the mask width so they can be easily concatenated together. - uint64_t ShAmtDiff = ShAmtY - ShAmtX; - unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0); - unsigned BitWidth = Ty->getPrimitiveSizeInBits(); - auto *MaskTy = dyn_cast(SrcX->getType()); - if (!MaskTy || SrcX->getType() != SrcY->getType() || - !MaskTy->getElementType()->isIntegerTy(1) || - MaskTy->getNumElements() != ShAmtDiff || - MaskTy->getNumElements() > (BitWidth / 2)) - return false; - - auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy); - auto *ConcatIntTy = - Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements()); - auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff); - - SmallVector ConcatMask(ConcatTy->getNumElements()); - std::iota(ConcatMask.begin(), ConcatMask.end(), 0); - - // TODO: Is it worth supporting multi use cases? - InstructionCost OldCost = 0; - OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind); - OldCost += - NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind); - OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy, - TTI::CastContextHint::None, CostKind); - OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy, - TTI::CastContextHint::None, CostKind); - - InstructionCost NewCost = 0; - NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, MaskTy, - ConcatMask, CostKind); - NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy, - TTI::CastContextHint::None, CostKind); - if (Ty != ConcatIntTy) - NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy, - TTI::CastContextHint::None, CostKind); - if (ShAmtX > 0) - NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind); - - if (NewCost > OldCost) - return false; - - // Build bool mask concatenation, bitcast back to scalar integer, and perform - // any residual zero-extension or shifting. - Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask); - Worklist.pushValue(Concat); - - Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy); - - if (Ty != ConcatIntTy) { - Worklist.pushValue(Result); - Result = Builder.CreateZExt(Result, Ty); - } - - if (ShAmtX > 0) { - Worklist.pushValue(Result); - Result = Builder.CreateShl(Result, ShAmtX); - } - - replaceValue(I, *Result); - return true; -} - /// Try to convert "shuffle (binop (shuffle, shuffle)), undef" /// --> "binop (shuffle), (shuffle)". bool VectorCombine::foldPermuteOfBinops(Instruction &I) { @@ -3016,9 +2908,6 @@ bool VectorCombine::run() { if (TryEarlyFoldsOnly) return; - if (I.getType()->isIntegerTy()) - MadeChange |= foldConcatOfBoolMasks(I); - // Otherwise, try folds that improve codegen but may interfere with // early IR canonicalizations. // The type checking is for run-time efficiency. We can avoid wasting time diff --git a/llvm/test/Transforms/PhaseOrdering/X86/concat-boolmasks.ll b/llvm/test/Transforms/PhaseOrdering/X86/concat-boolmasks.ll index d068e1b2e0516..07bfbffa9518f 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/concat-boolmasks.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/concat-boolmasks.ll @@ -1,22 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s +; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s +; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s +; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s define i32 @movmsk_i32_v32i8_v16i8(<16 x i8> %v0, <16 x i8> %v1) { -; SSE-LABEL: @movmsk_i32_v32i8_v16i8( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> -; SSE-NEXT: [[TMP2:%.*]] = icmp slt <32 x i8> [[TMP1]], zeroinitializer -; SSE-NEXT: [[OR:%.*]] = bitcast <32 x i1> [[TMP2]] to i32 -; SSE-NEXT: ret i32 [[OR]] -; -; AVX-LABEL: @movmsk_i32_v32i8_v16i8( -; AVX-NEXT: [[C0:%.*]] = icmp slt <16 x i8> [[V0:%.*]], zeroinitializer -; AVX-NEXT: [[C1:%.*]] = icmp slt <16 x i8> [[V1:%.*]], zeroinitializer -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i1> [[C1]], <16 x i1> [[C0]], <32 x i32> -; AVX-NEXT: [[OR:%.*]] = bitcast <32 x i1> [[TMP1]] to i32 -; AVX-NEXT: ret i32 [[OR]] +; CHECK-LABEL: @movmsk_i32_v32i8_v16i8( +; CHECK-NEXT: [[C0:%.*]] = icmp slt <16 x i8> [[V0:%.*]], zeroinitializer +; CHECK-NEXT: [[C1:%.*]] = icmp slt <16 x i8> [[V1:%.*]], zeroinitializer +; CHECK-NEXT: [[B0:%.*]] = bitcast <16 x i1> [[C0]] to i16 +; CHECK-NEXT: [[B1:%.*]] = bitcast <16 x i1> [[C1]] to i16 +; CHECK-NEXT: [[Z0:%.*]] = zext i16 [[B0]] to i32 +; CHECK-NEXT: [[Z1:%.*]] = zext i16 [[B1]] to i32 +; CHECK-NEXT: [[S0:%.*]] = shl nuw i32 [[Z0]], 16 +; CHECK-NEXT: [[OR:%.*]] = or disjoint i32 [[S0]], [[Z1]] +; CHECK-NEXT: ret i32 [[OR]] ; %c0 = icmp slt <16 x i8> %v0, zeroinitializer %c1 = icmp slt <16 x i8> %v1, zeroinitializer @@ -30,20 +28,16 @@ define i32 @movmsk_i32_v32i8_v16i8(<16 x i8> %v0, <16 x i8> %v1) { } define i32 @movmsk_i32_v8i32_v4i32(<4 x i32> %v0, <4 x i32> %v1) { -; SSE-LABEL: @movmsk_i32_v8i32_v4i32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = icmp slt <8 x i32> [[TMP1]], zeroinitializer -; SSE-NEXT: [[TMP3:%.*]] = bitcast <8 x i1> [[TMP2]] to i8 -; SSE-NEXT: [[OR:%.*]] = zext i8 [[TMP3]] to i32 -; SSE-NEXT: ret i32 [[OR]] -; -; AVX-LABEL: @movmsk_i32_v8i32_v4i32( -; AVX-NEXT: [[C0:%.*]] = icmp slt <4 x i32> [[V0:%.*]], zeroinitializer -; AVX-NEXT: [[C1:%.*]] = icmp slt <4 x i32> [[V1:%.*]], zeroinitializer -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[C1]], <4 x i1> [[C0]], <8 x i32> -; AVX-NEXT: [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8 -; AVX-NEXT: [[OR:%.*]] = zext i8 [[TMP2]] to i32 -; AVX-NEXT: ret i32 [[OR]] +; CHECK-LABEL: @movmsk_i32_v8i32_v4i32( +; CHECK-NEXT: [[C0:%.*]] = icmp slt <4 x i32> [[V0:%.*]], zeroinitializer +; CHECK-NEXT: [[C1:%.*]] = icmp slt <4 x i32> [[V1:%.*]], zeroinitializer +; CHECK-NEXT: [[B0:%.*]] = bitcast <4 x i1> [[C0]] to i4 +; CHECK-NEXT: [[B1:%.*]] = bitcast <4 x i1> [[C1]] to i4 +; CHECK-NEXT: [[Z0:%.*]] = zext i4 [[B0]] to i32 +; CHECK-NEXT: [[Z1:%.*]] = zext i4 [[B1]] to i32 +; CHECK-NEXT: [[S0:%.*]] = shl nuw nsw i32 [[Z0]], 4 +; CHECK-NEXT: [[OR:%.*]] = or disjoint i32 [[S0]], [[Z1]] +; CHECK-NEXT: ret i32 [[OR]] ; %c0 = icmp slt <4 x i32> %v0, zeroinitializer %c1 = icmp slt <4 x i32> %v1, zeroinitializer @@ -57,20 +51,16 @@ define i32 @movmsk_i32_v8i32_v4i32(<4 x i32> %v0, <4 x i32> %v1) { } define i64 @movmsk_i64_v32i8_v16i8(<16 x i8> %v0, <16 x i8> %v1) { -; SSE-LABEL: @movmsk_i64_v32i8_v16i8( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> -; SSE-NEXT: [[TMP2:%.*]] = icmp slt <32 x i8> [[TMP1]], zeroinitializer -; SSE-NEXT: [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32 -; SSE-NEXT: [[OR:%.*]] = zext i32 [[TMP3]] to i64 -; SSE-NEXT: ret i64 [[OR]] -; -; AVX-LABEL: @movmsk_i64_v32i8_v16i8( -; AVX-NEXT: [[C0:%.*]] = icmp slt <16 x i8> [[V0:%.*]], zeroinitializer -; AVX-NEXT: [[C1:%.*]] = icmp slt <16 x i8> [[V1:%.*]], zeroinitializer -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i1> [[C1]], <16 x i1> [[C0]], <32 x i32> -; AVX-NEXT: [[TMP2:%.*]] = bitcast <32 x i1> [[TMP1]] to i32 -; AVX-NEXT: [[OR:%.*]] = zext i32 [[TMP2]] to i64 -; AVX-NEXT: ret i64 [[OR]] +; CHECK-LABEL: @movmsk_i64_v32i8_v16i8( +; CHECK-NEXT: [[C0:%.*]] = icmp slt <16 x i8> [[V0:%.*]], zeroinitializer +; CHECK-NEXT: [[C1:%.*]] = icmp slt <16 x i8> [[V1:%.*]], zeroinitializer +; CHECK-NEXT: [[B0:%.*]] = bitcast <16 x i1> [[C0]] to i16 +; CHECK-NEXT: [[B1:%.*]] = bitcast <16 x i1> [[C1]] to i16 +; CHECK-NEXT: [[Z0:%.*]] = zext i16 [[B0]] to i64 +; CHECK-NEXT: [[Z1:%.*]] = zext i16 [[B1]] to i64 +; CHECK-NEXT: [[S0:%.*]] = shl nuw nsw i64 [[Z0]], 16 +; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[S0]], [[Z1]] +; CHECK-NEXT: ret i64 [[OR]] ; %c0 = icmp slt <16 x i8> %v0, zeroinitializer %c1 = icmp slt <16 x i8> %v1, zeroinitializer @@ -84,20 +74,16 @@ define i64 @movmsk_i64_v32i8_v16i8(<16 x i8> %v0, <16 x i8> %v1) { } define i64 @movmsk_i64_v8i32_v4i32(<4 x i32> %v0, <4 x i32> %v1) { -; SSE-LABEL: @movmsk_i64_v8i32_v4i32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = icmp slt <8 x i32> [[TMP1]], zeroinitializer -; SSE-NEXT: [[TMP3:%.*]] = bitcast <8 x i1> [[TMP2]] to i8 -; SSE-NEXT: [[OR:%.*]] = zext i8 [[TMP3]] to i64 -; SSE-NEXT: ret i64 [[OR]] -; -; AVX-LABEL: @movmsk_i64_v8i32_v4i32( -; AVX-NEXT: [[C0:%.*]] = icmp slt <4 x i32> [[V0:%.*]], zeroinitializer -; AVX-NEXT: [[C1:%.*]] = icmp slt <4 x i32> [[V1:%.*]], zeroinitializer -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[C1]], <4 x i1> [[C0]], <8 x i32> -; AVX-NEXT: [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8 -; AVX-NEXT: [[OR:%.*]] = zext i8 [[TMP2]] to i64 -; AVX-NEXT: ret i64 [[OR]] +; CHECK-LABEL: @movmsk_i64_v8i32_v4i32( +; CHECK-NEXT: [[C0:%.*]] = icmp slt <4 x i32> [[V0:%.*]], zeroinitializer +; CHECK-NEXT: [[C1:%.*]] = icmp slt <4 x i32> [[V1:%.*]], zeroinitializer +; CHECK-NEXT: [[B0:%.*]] = bitcast <4 x i1> [[C0]] to i4 +; CHECK-NEXT: [[B1:%.*]] = bitcast <4 x i1> [[C1]] to i4 +; CHECK-NEXT: [[Z0:%.*]] = zext i4 [[B0]] to i64 +; CHECK-NEXT: [[Z1:%.*]] = zext i4 [[B1]] to i64 +; CHECK-NEXT: [[S0:%.*]] = shl nuw nsw i64 [[Z0]], 4 +; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[S0]], [[Z1]] +; CHECK-NEXT: ret i64 [[OR]] ; %c0 = icmp slt <4 x i32> %v0, zeroinitializer %c1 = icmp slt <4 x i32> %v1, zeroinitializer @@ -111,24 +97,26 @@ define i64 @movmsk_i64_v8i32_v4i32(<4 x i32> %v0, <4 x i32> %v1) { } define i64 @movmsk_i64_v64i8_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) { -; SSE-LABEL: @movmsk_i64_v64i8_v16i8( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <64 x i32> -; SSE-NEXT: [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer -; SSE-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64 -; SSE-NEXT: ret i64 [[OR]] -; -; AVX-LABEL: @movmsk_i64_v64i8_v16i8( -; AVX-NEXT: [[C0:%.*]] = icmp slt <16 x i8> [[V0:%.*]], zeroinitializer -; AVX-NEXT: [[C1:%.*]] = icmp slt <16 x i8> [[V1:%.*]], zeroinitializer -; AVX-NEXT: [[C2:%.*]] = icmp slt <16 x i8> [[V2:%.*]], zeroinitializer -; AVX-NEXT: [[C3:%.*]] = icmp slt <16 x i8> [[V3:%.*]], zeroinitializer -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <16 x i1> [[C1]], <16 x i1> [[C0]], <32 x i32> -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i1> [[C3]], <16 x i1> [[C2]], <32 x i32> -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <32 x i1> [[TMP1]], <32 x i1> [[TMP2]], <64 x i32> -; AVX-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP3]] to i64 -; AVX-NEXT: ret i64 [[OR]] +; CHECK-LABEL: @movmsk_i64_v64i8_v16i8( +; CHECK-NEXT: [[C0:%.*]] = icmp slt <16 x i8> [[V0:%.*]], zeroinitializer +; CHECK-NEXT: [[C1:%.*]] = icmp slt <16 x i8> [[V1:%.*]], zeroinitializer +; CHECK-NEXT: [[C2:%.*]] = icmp slt <16 x i8> [[V2:%.*]], zeroinitializer +; CHECK-NEXT: [[C3:%.*]] = icmp slt <16 x i8> [[V3:%.*]], zeroinitializer +; CHECK-NEXT: [[B0:%.*]] = bitcast <16 x i1> [[C0]] to i16 +; CHECK-NEXT: [[B1:%.*]] = bitcast <16 x i1> [[C1]] to i16 +; CHECK-NEXT: [[B2:%.*]] = bitcast <16 x i1> [[C2]] to i16 +; CHECK-NEXT: [[B3:%.*]] = bitcast <16 x i1> [[C3]] to i16 +; CHECK-NEXT: [[Z0:%.*]] = zext i16 [[B0]] to i64 +; CHECK-NEXT: [[Z1:%.*]] = zext i16 [[B1]] to i64 +; CHECK-NEXT: [[Z2:%.*]] = zext i16 [[B2]] to i64 +; CHECK-NEXT: [[Z3:%.*]] = zext i16 [[B3]] to i64 +; CHECK-NEXT: [[S0:%.*]] = shl nuw i64 [[Z0]], 48 +; CHECK-NEXT: [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 32 +; CHECK-NEXT: [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16 +; CHECK-NEXT: [[OR0:%.*]] = or disjoint i64 [[S1]], [[S0]] +; CHECK-NEXT: [[OR1:%.*]] = or disjoint i64 [[S2]], [[Z3]] +; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[OR1]], [[OR0]] +; CHECK-NEXT: ret i64 [[OR]] ; %c0 = icmp slt <16 x i8> %v0, zeroinitializer %c1 = icmp slt <16 x i8> %v1, zeroinitializer @@ -152,26 +140,26 @@ define i64 @movmsk_i64_v64i8_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, } define i64 @movmsk_i64_v32i32_v4i32(<4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { -; SSE-LABEL: @movmsk_i64_v32i32_v4i32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <16 x i32> -; SSE-NEXT: [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer -; SSE-NEXT: [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 -; SSE-NEXT: [[OR:%.*]] = zext i16 [[TMP5]] to i64 -; SSE-NEXT: ret i64 [[OR]] -; -; AVX-LABEL: @movmsk_i64_v32i32_v4i32( -; AVX-NEXT: [[C0:%.*]] = icmp slt <4 x i32> [[V0:%.*]], zeroinitializer -; AVX-NEXT: [[C1:%.*]] = icmp slt <4 x i32> [[V1:%.*]], zeroinitializer -; AVX-NEXT: [[C2:%.*]] = icmp slt <4 x i32> [[V2:%.*]], zeroinitializer -; AVX-NEXT: [[C3:%.*]] = icmp slt <4 x i32> [[V3:%.*]], zeroinitializer -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i1> [[C1]], <4 x i1> [[C0]], <8 x i32> -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[C3]], <4 x i1> [[C2]], <8 x i32> -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <16 x i32> -; AVX-NEXT: [[TMP4:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 -; AVX-NEXT: [[OR:%.*]] = zext i16 [[TMP4]] to i64 -; AVX-NEXT: ret i64 [[OR]] +; CHECK-LABEL: @movmsk_i64_v32i32_v4i32( +; CHECK-NEXT: [[C0:%.*]] = icmp slt <4 x i32> [[V0:%.*]], zeroinitializer +; CHECK-NEXT: [[C1:%.*]] = icmp slt <4 x i32> [[V1:%.*]], zeroinitializer +; CHECK-NEXT: [[C2:%.*]] = icmp slt <4 x i32> [[V2:%.*]], zeroinitializer +; CHECK-NEXT: [[C3:%.*]] = icmp slt <4 x i32> [[V3:%.*]], zeroinitializer +; CHECK-NEXT: [[B0:%.*]] = bitcast <4 x i1> [[C0]] to i4 +; CHECK-NEXT: [[B1:%.*]] = bitcast <4 x i1> [[C1]] to i4 +; CHECK-NEXT: [[B2:%.*]] = bitcast <4 x i1> [[C2]] to i4 +; CHECK-NEXT: [[B3:%.*]] = bitcast <4 x i1> [[C3]] to i4 +; CHECK-NEXT: [[Z0:%.*]] = zext i4 [[B0]] to i64 +; CHECK-NEXT: [[Z1:%.*]] = zext i4 [[B1]] to i64 +; CHECK-NEXT: [[Z2:%.*]] = zext i4 [[B2]] to i64 +; CHECK-NEXT: [[Z3:%.*]] = zext i4 [[B3]] to i64 +; CHECK-NEXT: [[S0:%.*]] = shl nuw nsw i64 [[Z0]], 12 +; CHECK-NEXT: [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8 +; CHECK-NEXT: [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 4 +; CHECK-NEXT: [[OR0:%.*]] = or disjoint i64 [[S1]], [[S0]] +; CHECK-NEXT: [[OR1:%.*]] = or disjoint i64 [[S2]], [[Z3]] +; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[OR1]], [[OR0]] +; CHECK-NEXT: ret i64 [[OR]] ; %c0 = icmp slt <4 x i32> %v0, zeroinitializer %c1 = icmp slt <4 x i32> %v1, zeroinitializer @@ -195,24 +183,16 @@ define i64 @movmsk_i64_v32i32_v4i32(<4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, } define i64 @movmsk_i64_v64i8_v32i8(<32 x i8> %v0, <32 x i8> %v1) { -; SSE-LABEL: @movmsk_i64_v64i8_v32i8( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[V1:%.*]], <32 x i8> [[V0:%.*]], <64 x i32> -; SSE-NEXT: [[TMP2:%.*]] = icmp slt <64 x i8> [[TMP1]], zeroinitializer -; SSE-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP2]] to i64 -; SSE-NEXT: ret i64 [[OR]] -; -; AVX2-LABEL: @movmsk_i64_v64i8_v32i8( -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[V1:%.*]], <32 x i8> [[V0:%.*]], <64 x i32> -; AVX2-NEXT: [[TMP2:%.*]] = icmp slt <64 x i8> [[TMP1]], zeroinitializer -; AVX2-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP2]] to i64 -; AVX2-NEXT: ret i64 [[OR]] -; -; AVX512-LABEL: @movmsk_i64_v64i8_v32i8( -; AVX512-NEXT: [[C0:%.*]] = icmp slt <32 x i8> [[V0:%.*]], zeroinitializer -; AVX512-NEXT: [[C1:%.*]] = icmp slt <32 x i8> [[V1:%.*]], zeroinitializer -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <32 x i1> [[C1]], <32 x i1> [[C0]], <64 x i32> -; AVX512-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP1]] to i64 -; AVX512-NEXT: ret i64 [[OR]] +; CHECK-LABEL: @movmsk_i64_v64i8_v32i8( +; CHECK-NEXT: [[C0:%.*]] = icmp slt <32 x i8> [[V0:%.*]], zeroinitializer +; CHECK-NEXT: [[C1:%.*]] = icmp slt <32 x i8> [[V1:%.*]], zeroinitializer +; CHECK-NEXT: [[B0:%.*]] = bitcast <32 x i1> [[C0]] to i32 +; CHECK-NEXT: [[B1:%.*]] = bitcast <32 x i1> [[C1]] to i32 +; CHECK-NEXT: [[Z0:%.*]] = zext i32 [[B0]] to i64 +; CHECK-NEXT: [[Z1:%.*]] = zext i32 [[B1]] to i64 +; CHECK-NEXT: [[S0:%.*]] = shl nuw i64 [[Z0]], 32 +; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[S0]], [[Z1]] +; CHECK-NEXT: ret i64 [[OR]] ; %c0 = icmp slt <32 x i8> %v0, zeroinitializer %c1 = icmp slt <32 x i8> %v1, zeroinitializer @@ -226,27 +206,16 @@ define i64 @movmsk_i64_v64i8_v32i8(<32 x i8> %v0, <32 x i8> %v1) { } define i32 @movmsk_i32_v16i32_v8i32(<8 x i32> %v0, <8 x i32> %v1) { -; SSE-LABEL: @movmsk_i32_v16i32_v8i32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[V1:%.*]], <8 x i32> [[V0:%.*]], <16 x i32> -; SSE-NEXT: [[TMP2:%.*]] = icmp slt <16 x i32> [[TMP1]], zeroinitializer -; SSE-NEXT: [[TMP3:%.*]] = bitcast <16 x i1> [[TMP2]] to i16 -; SSE-NEXT: [[OR:%.*]] = zext i16 [[TMP3]] to i32 -; SSE-NEXT: ret i32 [[OR]] -; -; AVX2-LABEL: @movmsk_i32_v16i32_v8i32( -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[V1:%.*]], <8 x i32> [[V0:%.*]], <16 x i32> -; AVX2-NEXT: [[TMP2:%.*]] = icmp slt <16 x i32> [[TMP1]], zeroinitializer -; AVX2-NEXT: [[TMP3:%.*]] = bitcast <16 x i1> [[TMP2]] to i16 -; AVX2-NEXT: [[OR:%.*]] = zext i16 [[TMP3]] to i32 -; AVX2-NEXT: ret i32 [[OR]] -; -; AVX512-LABEL: @movmsk_i32_v16i32_v8i32( -; AVX512-NEXT: [[C0:%.*]] = icmp slt <8 x i32> [[V0:%.*]], zeroinitializer -; AVX512-NEXT: [[C1:%.*]] = icmp slt <8 x i32> [[V1:%.*]], zeroinitializer -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i1> [[C1]], <8 x i1> [[C0]], <16 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = bitcast <16 x i1> [[TMP1]] to i16 -; AVX512-NEXT: [[OR:%.*]] = zext i16 [[TMP2]] to i32 -; AVX512-NEXT: ret i32 [[OR]] +; CHECK-LABEL: @movmsk_i32_v16i32_v8i32( +; CHECK-NEXT: [[C0:%.*]] = icmp slt <8 x i32> [[V0:%.*]], zeroinitializer +; CHECK-NEXT: [[C1:%.*]] = icmp slt <8 x i32> [[V1:%.*]], zeroinitializer +; CHECK-NEXT: [[B0:%.*]] = bitcast <8 x i1> [[C0]] to i8 +; CHECK-NEXT: [[B1:%.*]] = bitcast <8 x i1> [[C1]] to i8 +; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[B0]] to i32 +; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[B1]] to i32 +; CHECK-NEXT: [[S0:%.*]] = shl nuw nsw i32 [[Z0]], 8 +; CHECK-NEXT: [[OR:%.*]] = or disjoint i32 [[S0]], [[Z1]] +; CHECK-NEXT: ret i32 [[OR]] ; %c0 = icmp slt <8 x i32> %v0, zeroinitializer %c1 = icmp slt <8 x i32> %v1, zeroinitializer @@ -260,26 +229,16 @@ define i32 @movmsk_i32_v16i32_v8i32(<8 x i32> %v0, <8 x i32> %v1) { } define i64 @PR111431(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) { -; SSE-LABEL: @PR111431( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <64 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <32 x i8> [[A2:%.*]], <32 x i8> [[A1:%.*]], <64 x i32> -; SSE-NEXT: [[TMP3:%.*]] = icmp eq <64 x i8> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP3]] to i64 -; SSE-NEXT: ret i64 [[OR]] -; -; AVX2-LABEL: @PR111431( -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <64 x i32> -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <32 x i8> [[A2:%.*]], <32 x i8> [[A1:%.*]], <64 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = icmp eq <64 x i8> [[TMP1]], [[TMP2]] -; AVX2-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP3]] to i64 -; AVX2-NEXT: ret i64 [[OR]] -; -; AVX512-LABEL: @PR111431( -; AVX512-NEXT: [[C01:%.*]] = icmp eq <32 x i8> [[A0:%.*]], [[A1:%.*]] -; AVX512-NEXT: [[C02:%.*]] = icmp eq <32 x i8> [[A0]], [[A2:%.*]] -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <32 x i1> [[C02]], <32 x i1> [[C01]], <64 x i32> -; AVX512-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP1]] to i64 -; AVX512-NEXT: ret i64 [[OR]] +; CHECK-LABEL: @PR111431( +; CHECK-NEXT: [[C01:%.*]] = icmp eq <32 x i8> [[A0:%.*]], [[A1:%.*]] +; CHECK-NEXT: [[C02:%.*]] = icmp eq <32 x i8> [[A0]], [[A2:%.*]] +; CHECK-NEXT: [[B01:%.*]] = bitcast <32 x i1> [[C01]] to i32 +; CHECK-NEXT: [[B02:%.*]] = bitcast <32 x i1> [[C02]] to i32 +; CHECK-NEXT: [[Z01:%.*]] = zext i32 [[B01]] to i64 +; CHECK-NEXT: [[Z02:%.*]] = zext i32 [[B02]] to i64 +; CHECK-NEXT: [[SHL:%.*]] = shl nuw i64 [[Z01]], 32 +; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[SHL]], [[Z02]] +; CHECK-NEXT: ret i64 [[OR]] ; %c01 = icmp eq <32 x i8> %a0, %a1 %c02 = icmp eq <32 x i8> %a0, %a2