diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 558d75c5eb388..7b3526cca119f 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -10706,6 +10706,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { }); SmallPtrSet UniqueBases; unsigned SliceSize = getPartNumElems(VL.size(), NumParts); + SmallDenseMap VectorOpsToExtracts; for (unsigned Part : seq(NumParts)) { unsigned Limit = getNumElems(VL.size(), SliceSize, Part); ArrayRef SubMask = Mask.slice(Part * SliceSize, Limit); @@ -10756,10 +10757,18 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { continue; } } - Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), - CostKind, Idx); - } - } + APInt &DemandedElts = + VectorOpsToExtracts + .try_emplace(VecBase, + APInt::getZero(getNumElements(VecBase->getType()))) + .first->getSecond(); + DemandedElts.setBit(Idx); + } + } + for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts) + Cost -= TTI.getScalarizationOverhead(cast(Vec->getType()), + DemandedElts, /*Insert=*/false, + /*Extract=*/true, CostKind); // Check that gather of extractelements can be represented as just a // shuffle of a single/two vectors the scalars are extracted from. // Found the bunch of extractelement instructions that must be gathered @@ -11283,24 +11292,27 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, } case Instruction::ExtractValue: case Instruction::ExtractElement: { + APInt DemandedElts; + VectorType *SrcVecTy = nullptr; auto GetScalarCost = [&](unsigned Idx) { if (isa(UniqueValues[Idx])) return InstructionCost(TTI::TCC_Free); auto *I = cast(UniqueValues[Idx]); - VectorType *SrcVecTy; - if (ShuffleOrOp == Instruction::ExtractElement) { - auto *EE = cast(I); - SrcVecTy = EE->getVectorOperandType(); - } else { - auto *EV = cast(I); - Type *AggregateTy = EV->getAggregateOperand()->getType(); - unsigned NumElts; - if (auto *ATy = dyn_cast(AggregateTy)) - NumElts = ATy->getNumElements(); - else - NumElts = AggregateTy->getStructNumElements(); - SrcVecTy = getWidenedType(OrigScalarTy, NumElts); + if (!SrcVecTy) { + if (ShuffleOrOp == Instruction::ExtractElement) { + auto *EE = cast(I); + SrcVecTy = EE->getVectorOperandType(); + } else { + auto *EV = cast(I); + Type *AggregateTy = EV->getAggregateOperand()->getType(); + unsigned NumElts; + if (auto *ATy = dyn_cast(AggregateTy)) + NumElts = ATy->getNumElements(); + else + NumElts = AggregateTy->getStructNumElements(); + SrcVecTy = getWidenedType(OrigScalarTy, NumElts); + } } if (I->hasOneUse()) { Instruction *Ext = I->user_back(); @@ -11317,10 +11329,18 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, return Cost; } } - return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy, - CostKind, *getExtractIndex(I)); + if (DemandedElts.isZero()) + DemandedElts = APInt::getZero(getNumElements(SrcVecTy)); + DemandedElts.setBit(*getExtractIndex(I)); + return InstructionCost(TTI::TCC_Free); + }; + auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) { + return CommonCost - (DemandedElts.isZero() + ? TTI::TCC_Free + : TTI.getScalarizationOverhead( + SrcVecTy, DemandedElts, /*Insert=*/false, + /*Extract=*/true, CostKind)); }; - auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; }; return GetCostDiff(GetScalarCost, GetVectorCost); } case Instruction::InsertElement: { @@ -13663,6 +13683,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef VL, bool ForPoisonSrc, // Check if the same elements are inserted several times and count them as // shuffle candidates. APInt ShuffledElements = APInt::getZero(VL.size()); + APInt DemandedElements = APInt::getZero(VL.size()); DenseMap UniqueElements; constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; InstructionCost Cost; @@ -13673,9 +13694,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef VL, bool ForPoisonSrc, V = nullptr; } if (!ForPoisonSrc) - Cost += - TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, - I, Constant::getNullValue(VecTy), V); + DemandedElements.setBit(I); }; SmallVector ShuffleMask(VL.size(), PoisonMaskElem); for (unsigned I = 0, E = VL.size(); I < E; ++I) { @@ -13698,6 +13717,10 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef VL, bool ForPoisonSrc, ShuffledElements.setBit(I); ShuffleMask[I] = Res.first->second; } + if (!DemandedElements.isZero()) + Cost += + TTI->getScalarizationOverhead(VecTy, DemandedElements, /*Insert=*/true, + /*Extract=*/false, CostKind, VL); if (ForPoisonSrc) { if (isa(ScalarTy)) { assert(SLPReVec && "Only supported by REVEC."); diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll index 29bd81998cdb2..bb88edff11634 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll @@ -607,35 +607,13 @@ define <2 x i32> @sdiv_v2i32_unknown_divisor(<2 x i32> %a, <2 x i32> %x, <2 x i3 ; computes (a/const + x - y) * z define <2 x i32> @sdiv_v2i32_const_divisor(<2 x i32> %a, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) -; NO-SVE-LABEL: define <2 x i32> @sdiv_v2i32_const_divisor( -; NO-SVE-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] { -; NO-SVE-NEXT: [[A0:%.*]] = extractelement <2 x i32> [[A]], i64 0 -; NO-SVE-NEXT: [[A1:%.*]] = extractelement <2 x i32> [[A]], i64 1 -; NO-SVE-NEXT: [[TMP1:%.*]] = sdiv i32 [[A0]], 2 -; NO-SVE-NEXT: [[TMP2:%.*]] = sdiv i32 [[A1]], 4 -; NO-SVE-NEXT: [[X0:%.*]] = extractelement <2 x i32> [[X]], i64 0 -; NO-SVE-NEXT: [[X1:%.*]] = extractelement <2 x i32> [[X]], i64 1 -; NO-SVE-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[X0]] -; NO-SVE-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], [[X1]] -; NO-SVE-NEXT: [[Y0:%.*]] = extractelement <2 x i32> [[Y]], i64 0 -; NO-SVE-NEXT: [[Y1:%.*]] = extractelement <2 x i32> [[Y]], i64 1 -; NO-SVE-NEXT: [[TMP5:%.*]] = sub i32 [[TMP3]], [[Y0]] -; NO-SVE-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[Y1]] -; NO-SVE-NEXT: [[Z0:%.*]] = extractelement <2 x i32> [[Z]], i64 0 -; NO-SVE-NEXT: [[Z1:%.*]] = extractelement <2 x i32> [[Z]], i64 1 -; NO-SVE-NEXT: [[TMP7:%.*]] = mul i32 [[TMP5]], [[Z0]] -; NO-SVE-NEXT: [[TMP8:%.*]] = mul i32 [[TMP6]], [[Z1]] -; NO-SVE-NEXT: [[RES0:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 -; NO-SVE-NEXT: [[RES1:%.*]] = insertelement <2 x i32> [[RES0]], i32 [[TMP8]], i32 1 -; NO-SVE-NEXT: ret <2 x i32> [[RES1]] -; -; SVE-LABEL: define <2 x i32> @sdiv_v2i32_const_divisor( -; SVE-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] { -; SVE-NEXT: [[TMP1:%.*]] = sdiv <2 x i32> [[A]], -; SVE-NEXT: [[TMP2:%.*]] = add <2 x i32> [[TMP1]], [[X]] -; SVE-NEXT: [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], [[Y]] -; SVE-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[TMP3]], [[Z]] -; SVE-NEXT: ret <2 x i32> [[TMP4]] +; CHECK-LABEL: define <2 x i32> @sdiv_v2i32_const_divisor( +; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = sdiv <2 x i32> [[A]], +; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i32> [[TMP1]], [[X]] +; CHECK-NEXT: [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], [[Y]] +; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[TMP3]], [[Z]] +; CHECK-NEXT: ret <2 x i32> [[TMP4]] ; { %a0 = extractelement <2 x i32> %a, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-with-reuses.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-with-reuses.ll index 579239bc659bd..75a413ffc1fb1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-with-reuses.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-with-reuses.ll @@ -10,15 +10,18 @@ define <4 x double> @test(ptr %ia, ptr %ib, ptr %ic, ptr %id, ptr %ie, ptr %x) { ; CHECK-NEXT: [[I4275:%.*]] = load double, ptr [[ID]], align 8 ; CHECK-NEXT: [[I4277:%.*]] = load double, ptr [[IE]], align 8 ; CHECK-NEXT: [[I4326:%.*]] = load <4 x double>, ptr [[X]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[I4326]], <4 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[I4275]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> poison, double [[I4238]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> [[TMP4]], double [[I4252]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[I4264]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double [[I4277]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP7]] -; CHECK-NEXT: ret <4 x double> [[TMP8]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[I4326]], <4 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[I4238]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[I4252]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP1]], double [[I4275]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[I4264]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[I4277]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x double> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[I44281:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> [[TMP10]], <4 x i32> +; CHECK-NEXT: ret <4 x double> [[I44281]] ; %i4238 = load double, ptr %ia, align 8 %i4252 = load double, ptr %ib, align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll index de99654d84eb8..c2369a6a89ec1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll @@ -9,7 +9,7 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16 ; YAML-NEXT: Function: foo ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-3' +; YAML-NEXT: - Cost: '-4' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '10' ; YAML-NEXT: ... diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll index 2cdbd5cff4468..cb4783010965e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll @@ -49,11 +49,24 @@ define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, < ; ; AVX512-LABEL: @reduce_and4( ; AVX512-NEXT: entry: -; AVX512-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> -; AVX512-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]]) -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> -; AVX512-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]]) -; AVX512-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]] +; AVX512-NEXT: [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0 +; AVX512-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i32> [[V1]], i64 1 +; AVX512-NEXT: [[VECEXT2:%.*]] = extractelement <4 x i32> [[V1]], i64 2 +; AVX512-NEXT: [[VECEXT4:%.*]] = extractelement <4 x i32> [[V1]], i64 3 +; AVX512-NEXT: [[VECEXT7:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0 +; AVX512-NEXT: [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1 +; AVX512-NEXT: [[VECEXT10:%.*]] = extractelement <4 x i32> [[V2]], i64 2 +; AVX512-NEXT: [[VECEXT12:%.*]] = extractelement <4 x i32> [[V2]], i64 3 +; AVX512-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <16 x i32> +; AVX512-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[VECEXT8]], i32 8 +; AVX512-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[VECEXT7]], i32 9 +; AVX512-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[VECEXT10]], i32 10 +; AVX512-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[VECEXT12]], i32 11 +; AVX512-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[VECEXT1]], i32 12 +; AVX512-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[VECEXT]], i32 13 +; AVX512-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[VECEXT2]], i32 14 +; AVX512-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[VECEXT4]], i32 15 +; AVX512-NEXT: [[OP_RDX:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP8]]) ; AVX512-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] ; AVX512-NEXT: ret i32 [[OP_RDX1]] ; @@ -131,11 +144,24 @@ define i32 @reduce_and4_transpose(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i ; AVX2-NEXT: ret i32 [[OP_RDX]] ; ; AVX512-LABEL: @reduce_and4_transpose( -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) -; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> -; AVX512-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) -; AVX512-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]] +; AVX512-NEXT: [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0 +; AVX512-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0 +; AVX512-NEXT: [[VECEXT7:%.*]] = extractelement <4 x i32> [[V1]], i64 1 +; AVX512-NEXT: [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1 +; AVX512-NEXT: [[VECEXT15:%.*]] = extractelement <4 x i32> [[V1]], i64 2 +; AVX512-NEXT: [[VECEXT16:%.*]] = extractelement <4 x i32> [[V2]], i64 2 +; AVX512-NEXT: [[VECEXT23:%.*]] = extractelement <4 x i32> [[V1]], i64 3 +; AVX512-NEXT: [[VECEXT24:%.*]] = extractelement <4 x i32> [[V2]], i64 3 +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <16 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[VECEXT24]], i32 8 +; AVX512-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[VECEXT16]], i32 9 +; AVX512-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[VECEXT8]], i32 10 +; AVX512-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[VECEXT1]], i32 11 +; AVX512-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[VECEXT23]], i32 12 +; AVX512-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[VECEXT15]], i32 13 +; AVX512-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[VECEXT7]], i32 14 +; AVX512-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[VECEXT]], i32 15 +; AVX512-NEXT: [[OP_RDX:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP9]]) ; AVX512-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] ; AVX512-NEXT: ret i32 [[OP_RDX1]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/reduction-gather-non-scheduled-extracts.ll b/llvm/test/Transforms/SLPVectorizer/reduction-gather-non-scheduled-extracts.ll index f1034f3971135..ae5018a63e214 100644 --- a/llvm/test/Transforms/SLPVectorizer/reduction-gather-non-scheduled-extracts.ll +++ b/llvm/test/Transforms/SLPVectorizer/reduction-gather-non-scheduled-extracts.ll @@ -1,22 +1,43 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-sie-ps5 < %s | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-sie-ps5 < %s | FileCheck %s --check-prefix=X86 %} +; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=AARCH64 %} define void @tes() { -; CHECK-LABEL: define void @tes() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = fcmp ole <2 x double> zeroinitializer, zeroinitializer -; CHECK-NEXT: br label [[TMP1:%.*]] -; CHECK: 1: -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i1> zeroinitializer, <2 x i1> [[TMP0]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP3]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 false, i1 [[TMP4]], i1 false -; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 false, i1 [[OP_RDX]], i1 false -; CHECK-NEXT: br i1 [[OP_RDX1]], label [[TMP6:%.*]], label [[TMP5:%.*]] -; CHECK: 4: -; CHECK-NEXT: ret void -; CHECK: 5: -; CHECK-NEXT: ret void +; X86-LABEL: define void @tes() { +; X86-NEXT: entry: +; X86-NEXT: [[TMP0:%.*]] = fcmp ole <2 x double> zeroinitializer, zeroinitializer +; X86-NEXT: br label [[TMP1:%.*]] +; X86: 1: +; X86-NEXT: [[TMP2:%.*]] = shufflevector <2 x i1> zeroinitializer, <2 x i1> [[TMP0]], <4 x i32> +; X86-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP2]]) +; X86-NEXT: [[OP_RDX:%.*]] = select i1 false, i1 [[TMP3]], i1 false +; X86-NEXT: [[OP_RDX1:%.*]] = select i1 false, i1 [[OP_RDX]], i1 false +; X86-NEXT: br i1 [[OP_RDX1]], label [[TMP4:%.*]], label [[TMP5:%.*]] +; X86: 4: +; X86-NEXT: ret void +; X86: 5: +; X86-NEXT: ret void +; +; AARCH64-LABEL: define void @tes() { +; AARCH64-NEXT: entry: +; AARCH64-NEXT: [[TMP0:%.*]] = extractelement <2 x i1> zeroinitializer, i64 0 +; AARCH64-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> zeroinitializer, i64 0 +; AARCH64-NEXT: [[TMP2:%.*]] = fcmp ole <2 x double> zeroinitializer, zeroinitializer +; AARCH64-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0 +; AARCH64-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> zeroinitializer, i64 0 +; AARCH64-NEXT: br label [[TMP5:%.*]] +; AARCH64: 5: +; AARCH64-NEXT: [[TMP6:%.*]] = select i1 false, i1 false, i1 false +; AARCH64-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i1 [[TMP0]], i1 false +; AARCH64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i1 [[TMP1]], i1 false +; AARCH64-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i1 false, i1 false +; AARCH64-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i1 [[TMP3]], i1 false +; AARCH64-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i1 [[TMP4]], i1 false +; AARCH64-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]] +; AARCH64: 12: +; AARCH64-NEXT: ret void +; AARCH64: 13: +; AARCH64-NEXT: ret void ; entry: %0 = extractelement <2 x i1> zeroinitializer, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/scalarazied-result.ll b/llvm/test/Transforms/SLPVectorizer/scalarazied-result.ll index 2570cdb45e1e7..0a6b86c953ee0 100644 --- a/llvm/test/Transforms/SLPVectorizer/scalarazied-result.ll +++ b/llvm/test/Transforms/SLPVectorizer/scalarazied-result.ll @@ -1,11 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -S < %s | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu -S < %s | FileCheck %s %} +; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -S < %s | FileCheck %s --check-prefix=X86 %} +; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu -S < %s | FileCheck %s --check-prefix=AARCH64 %} define void @test() { -; CHECK-LABEL: @test( -; CHECK-NEXT: entry: -; CHECK-NEXT: ret void +; X86-LABEL: @test( +; X86-NEXT: entry: +; X86-NEXT: ret void +; +; AARCH64-LABEL: @test( +; AARCH64-NEXT: entry: +; AARCH64-NEXT: [[TMP0:%.*]] = extractelement <8 x half> zeroinitializer, i64 1 +; AARCH64-NEXT: [[TOBOOL:%.*]] = fcmp une half [[TMP0]], 0xH0000 +; AARCH64-NEXT: [[TMP1:%.*]] = extractelement <8 x half> zeroinitializer, i64 1 +; AARCH64-NEXT: [[TOBOOL3:%.*]] = fcmp une half [[TMP1]], 0xH0000 +; AARCH64-NEXT: ret void ; entry: %0 = extractelement <8 x half> zeroinitializer, i64 1