From a776618e620951f1f7516849fe8b03154165439a Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 10 Apr 2025 20:04:29 +0000 Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?= =?UTF-8?q?l=20version?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.5 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 42 ++++++++++----- .../SystemZ/vec-elt-insertion.ll | 8 +-- .../Transforms/SLPVectorizer/X86/lookahead.ll | 46 +++++++++++------ .../X86/multi-incoming-blocks-in-phi.ll | 14 +++-- ...duced-val-extracted-and-externally-used.ll | 37 +++++++++----- .../X86/replaced-external-in-reduction.ll | 18 +++---- .../SLPVectorizer/X86/select-reduction-op.ll | 8 +-- .../SLPVectorizer/X86/vectorize-cmps.ll | 10 ++-- .../full-overlap-non-schedulable.ll | 51 ++++++++++--------- 9 files changed, 137 insertions(+), 97 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index b563dc8e4f2a6..0a1389e906aea 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -15353,15 +15353,14 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef VL, bool ForPoisonSrc, constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; InstructionCost Cost; auto EstimateInsertCost = [&](unsigned I, Value *V) { - if (V->getType() != ScalarTy) { + DemandedElements.setBit(I); + if (V->getType() != ScalarTy) Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(), TTI::CastContextHint::None, CostKind); - V = nullptr; - } - if (!ForPoisonSrc) - DemandedElements.setBit(I); }; SmallVector ShuffleMask(VL.size(), PoisonMaskElem); + SmallVector ConstantShuffleMask(VL.size(), PoisonMaskElem); + std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0); for (unsigned I = 0, E = VL.size(); I < E; ++I) { Value *V = VL[I]; // No need to shuffle duplicates for constants. 
@@ -15371,6 +15370,11 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef VL, bool ForPoisonSrc, continue; } + if (isConstant(V)) { + ConstantShuffleMask[I] = I + E; + ShuffleMask[I] = I; + continue; + } auto Res = UniqueElements.try_emplace(V, I); if (Res.second) { EstimateInsertCost(I, V); @@ -15382,18 +15386,28 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef VL, bool ForPoisonSrc, ShuffledElements.setBit(I); ShuffleMask[I] = Res.first->second; } - if (ForPoisonSrc) { - Cost = getScalarizationOverhead(*TTI, ScalarTy, VecTy, - /*DemandedElts*/ ~ShuffledElements, - /*Insert*/ true, - /*Extract*/ false, CostKind, - /*ForPoisonSrc=*/true, VL); - } else if (!DemandedElements.isZero()) { + // FIXME: add a cost for constant vector materialization. + bool IsAnyNonUndefConst = + any_of(VL, [](Value *V) { return !isa(V) && isConstant(V); }); + // 1. Shuffle input source vector and constant vector. + if (!ForPoisonSrc && IsAnyNonUndefConst) { + Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc, + VecTy, ConstantShuffleMask); + for (auto [Idx, I] : enumerate(ShuffleMask)) { + if (I == PoisonMaskElem) + I = Idx; + else + I += VL.size(); + } + } + + // 2. Insert unique non-constants. + if (!DemandedElements.isZero()) Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements, /*Insert=*/true, /*Extract=*/false, CostKind, - /*ForPoisonSrc=*/false, VL); - } + ForPoisonSrc && !IsAnyNonUndefConst, VL); + // 3. Shuffle duplicates. 
if (DuplicateNonConst) Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc, VecTy, ShuffleMask); diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll index 85b8157c949f1..afe01d3cd673d 100644 --- a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll @@ -114,13 +114,13 @@ define void @fun2(ptr %0, ptr %Dst) { ; CHECK: [[BB4]]: ; CHECK-NEXT: ret void ; CHECK: [[BB5]]: -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 24 -; CHECK-NEXT: store i64 [[TMP2]], ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 16 -; CHECK-NEXT: store i64 0, ptr [[TMP7]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> , i64 [[TMP2]], i32 1 +; CHECK-NEXT: store <2 x i64> [[TMP8]], ptr [[TMP7]], align 8 ; CHECK-NEXT: br label %[[BB4]] ; -; REMARK-NOT: Function: fun2 +; Looks like there is a bug in TTI, where insertion into index 1 is free, while insertion into index 0 costs 1. 
+; REMARK: Function: fun2 %3 = load i64, ptr %0, align 8 %4 = icmp eq i64 %3, 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll index faaac0c7614f6..d2e3d82cbe10a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -520,22 +520,36 @@ define i1 @ExtractIdxNotConstantInt2(float %a, float %b, float %c, <4 x float> % define i1 @foo(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) { -; CHECK-LABEL: @foo( -; CHECK-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0 -; CHECK-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[VEC]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[SUB14_I167]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[B:%.*]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 -; CHECK-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 -; CHECK-NEXT: ret i1 [[CMP_I185]] +; SSE-LABEL: @foo( +; SSE-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0 +; SSE-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x 
float> [[TMP1]], float [[C:%.*]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[VEC]], <4 x float> poison, <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[SUB14_I167]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[B:%.*]], i32 0 +; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] +; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 +; SSE-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]] +; SSE-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 +; SSE-NEXT: ret i1 [[CMP_I185]] +; +; AVX-LABEL: @foo( +; AVX-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0 +; AVX-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] +; AVX-NEXT: [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]] +; AVX-NEXT: [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]] +; AVX-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 1 +; AVX-NEXT: [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01 +; AVX-NEXT: [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]] +; AVX-NEXT: [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01 +; AVX-NEXT: [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]] +; AVX-NEXT: [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]] +; AVX-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 +; AVX-NEXT: ret i1 [[CMP_I185]] ; %vecext.i291.i166 = extractelement <4 x float> %vec, i64 0 %sub14.i167 = fsub float undef, %vecext.i291.i166 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-incoming-blocks-in-phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-incoming-blocks-in-phi.ll index ba35de391f9eb..2765cd03a42db 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/X86/multi-incoming-blocks-in-phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-incoming-blocks-in-phi.ll @@ -26,9 +26,8 @@ define void @foo(ptr %arg) { ; CHECK-NEXT: [[PHI:%.*]] = phi float [ 4.000000e+00, %[[BB]] ], [ 0.000000e+00, %[[BB27:.*]] ] ; CHECK-NEXT: [[FADD8:%.*]] = fadd float 0.000000e+00, 0.000000e+00 ; CHECK-NEXT: [[FADD9:%.*]] = fadd float [[PHI]], 1.000000e+00 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> , float [[FADD9]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x float> , [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = fadd float [[FADD9]], 1.000000e+00 +; CHECK-NEXT: [[FADD11:%.*]] = fadd float 0.000000e+00, 0.000000e+00 ; CHECK-NEXT: [[FREM:%.*]] = frem float [[TMP2]], 7.000000e+00 ; CHECK-NEXT: [[CALL12:%.*]] = call i32 @llvm.x86.sse.cvttss2si(<4 x float> zeroinitializer) ; CHECK-NEXT: switch i32 [[CALL12]], label %[[BB13:.*]] [ @@ -60,7 +59,6 @@ define void @foo(ptr %arg) { ; CHECK-NEXT: br label %[[BB20:.*]] ; CHECK: [[BB20]]: ; CHECK-NEXT: [[FADD21:%.*]] = fadd float [[FADD18]], 1.000000e+00 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> , float [[FADD21]], i32 0 ; CHECK-NEXT: switch i32 0, label %[[BB22:.*]] [ ; CHECK-NEXT: i32 125, label %[[BB30]] ; CHECK-NEXT: i32 98, label %[[BB30]] @@ -71,8 +69,8 @@ define void @foo(ptr %arg) { ; CHECK-NEXT: i32 121, label %[[BB30]] ; CHECK-NEXT: ] ; CHECK: [[BB22]]: -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> , [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = fadd float [[FADD21]], 1.000000e+00 +; CHECK-NEXT: [[TMP6:%.*]] = fadd float 0.000000e+00, 0.000000e+00 ; CHECK-NEXT: [[FREM25:%.*]] = frem float [[TMP5]], 7.000000e+00 ; CHECK-NEXT: [[FMUL26:%.*]] = fmul float [[FREM25]], 5.000000e+00 ; CHECK-NEXT: switch i32 0, label %[[BB27]] [ @@ -86,11 +84,11 @@ define void @foo(ptr %arg) { ; CHECK-NEXT: ] ; CHECK: 
[[BB27]]: ; CHECK-NEXT: [[FADD28:%.*]] = fadd float [[TMP5]], 1.000000e+00 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 ; CHECK-NEXT: [[FADD29:%.*]] = fadd float [[TMP6]], 0.000000e+00 ; CHECK-NEXT: br label %[[BB7]] ; CHECK: [[BB30]]: -; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x float> [ [[TMP1]], %[[BB7]] ], [ [[TMP1]], %[[BB7]] ], [ [[TMP1]], %[[BB7]] ], [ [[TMP1]], %[[BB7]] ], [ [[TMP1]], %[[BB7]] ], [ [[TMP1]], %[[BB7]] ], [ [[TMP1]], %[[BB7]] ], [ zeroinitializer, %[[BB13]] ], [ zeroinitializer, %[[BB13]] ], [ zeroinitializer, %[[BB13]] ], [ zeroinitializer, %[[BB13]] ], [ zeroinitializer, %[[BB13]] ], [ zeroinitializer, %[[BB13]] ], [ zeroinitializer, %[[BB13]] ], [ [[TMP3]], %[[BB20]] ], [ [[TMP3]], %[[BB20]] ], [ [[TMP3]], %[[BB20]] ], [ [[TMP3]], %[[BB20]] ], [ [[TMP3]], %[[BB20]] ], [ [[TMP3]], %[[BB20]] ], [ [[TMP3]], %[[BB20]] ], [ [[TMP4]], %[[BB22]] ], [ [[TMP4]], %[[BB22]] ], [ [[TMP4]], %[[BB22]] ], [ [[TMP4]], %[[BB22]] ], [ [[TMP4]], %[[BB22]] ], [ [[TMP4]], %[[BB22]] ], [ [[TMP4]], %[[BB22]] ] +; CHECK-NEXT: [[PHI31:%.*]] = phi float [ [[TMP2]], %[[BB7]] ], [ [[TMP2]], %[[BB7]] ], [ [[TMP2]], %[[BB7]] ], [ [[TMP2]], %[[BB7]] ], [ [[TMP2]], %[[BB7]] ], [ [[TMP2]], %[[BB7]] ], [ [[TMP2]], %[[BB7]] ], [ 0.000000e+00, %[[BB13]] ], [ 0.000000e+00, %[[BB13]] ], [ 0.000000e+00, %[[BB13]] ], [ 0.000000e+00, %[[BB13]] ], [ 0.000000e+00, %[[BB13]] ], [ 0.000000e+00, %[[BB13]] ], [ 0.000000e+00, %[[BB13]] ], [ [[FADD21]], %[[BB20]] ], [ [[FADD21]], %[[BB20]] ], [ [[FADD21]], %[[BB20]] ], [ [[FADD21]], %[[BB20]] ], [ [[FADD21]], %[[BB20]] ], [ [[FADD21]], %[[BB20]] ], [ [[FADD21]], %[[BB20]] ], [ [[TMP5]], %[[BB22]] ], [ [[TMP5]], %[[BB22]] ], [ [[TMP5]], %[[BB22]] ], [ [[TMP5]], %[[BB22]] ], [ [[TMP5]], %[[BB22]] ], [ [[TMP5]], %[[BB22]] ], [ [[TMP5]], %[[BB22]] ] +; CHECK-NEXT: [[PHI32:%.*]] = phi float [ [[FADD11]], %[[BB7]] ], [ [[FADD11]], %[[BB7]] ], [ [[FADD11]], %[[BB7]] ], [ [[FADD11]], %[[BB7]] ], [ [[FADD11]], %[[BB7]] ], [ 
[[FADD11]], %[[BB7]] ], [ [[FADD11]], %[[BB7]] ], [ 0.000000e+00, %[[BB13]] ], [ 0.000000e+00, %[[BB13]] ], [ 0.000000e+00, %[[BB13]] ], [ 0.000000e+00, %[[BB13]] ], [ 0.000000e+00, %[[BB13]] ], [ 0.000000e+00, %[[BB13]] ], [ 0.000000e+00, %[[BB13]] ], [ 0.000000e+00, %[[BB20]] ], [ 0.000000e+00, %[[BB20]] ], [ 0.000000e+00, %[[BB20]] ], [ 0.000000e+00, %[[BB20]] ], [ 0.000000e+00, %[[BB20]] ], [ 0.000000e+00, %[[BB20]] ], [ 0.000000e+00, %[[BB20]] ], [ [[TMP6]], %[[BB22]] ], [ [[TMP6]], %[[BB22]] ], [ [[TMP6]], %[[BB22]] ], [ [[TMP6]], %[[BB22]] ], [ [[TMP6]], %[[BB22]] ], [ [[TMP6]], %[[BB22]] ], [ [[TMP6]], %[[BB22]] ] ; CHECK-NEXT: ret void ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-extracted-and-externally-used.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-extracted-and-externally-used.ll index b25bf07067830..477a818ba5ccb 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-extracted-and-externally-used.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-extracted-and-externally-used.ll @@ -5,28 +5,37 @@ define void @test(i32 %arg) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: i32 [[ARG:%.*]]) { ; CHECK-NEXT: [[BB:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[ARG]], i32 0 ; CHECK-NEXT: br label %[[BB1:.*]] ; CHECK: [[BB1]]: ; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP5:%.*]], %[[BB1]] ] -; CHECK-NEXT: [[PHI2:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP6:%.*]], %[[BB1]] ] -; CHECK-NEXT: [[PHI3:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[OP_RDX4:%.*]], %[[BB1]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP4:%.*]], %[[BB1]] ] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[PHI2:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[ADD24:%.*]], %[[BB1]] ] +; CHECK-NEXT: [[PHI3:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[XOR26:%.*]], %[[BB1]] ] ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[PHI2]], 0 ; CHECK-NEXT: 
[[ADD4:%.*]] = add i32 [[PHI2]], 0 +; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[ADD]], [[ADD4]] ; CHECK-NEXT: [[ADD23:%.*]] = add i32 [[PHI]], 0 ; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[PHI2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4]] = add <2 x i32> [[TMP0]], -; CHECK-NEXT: [[TMP5]] = extractelement <2 x i32> [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP6]] = extractelement <2 x i32> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> [[TMP3]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = xor i32 [[TMP7]], [[ADD]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = xor i32 [[ADD4]], [[ADD6]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = xor i32 [[ADD23]], [[TMP6]] +; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[PHI2]], 0 +; CHECK-NEXT: [[XOR8:%.*]] = xor i32 [[ADD6]], [[XOR]] +; CHECK-NEXT: [[XOR9:%.*]] = xor i32 [[XOR8]], [[ADD23]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = xor i32 [[XOR9]], [[ADD7]] +; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[PHI]], 0 +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[PHI2]], 0 +; CHECK-NEXT: [[ADD13:%.*]] = add i32 [[PHI2]], 0 ; CHECK-NEXT: [[OP_RDX3:%.*]] = xor i32 [[OP_RDX]], [[OP_RDX1]] -; CHECK-NEXT: [[OP_RDX4]] = xor i32 [[OP_RDX3]], [[OP_RDX2]] +; CHECK-NEXT: [[OP_RDX4:%.*]] = xor i32 [[OP_RDX3]], [[OP_RDX2]] +; CHECK-NEXT: [[XOR16:%.*]] = xor i32 [[OP_RDX4]], [[ADD13]] +; CHECK-NEXT: [[ADD17:%.*]] = add i32 [[PHI]], 0 +; CHECK-NEXT: [[ADD18:%.*]] = add i32 [[PHI2]], 0 +; CHECK-NEXT: [[ADD19:%.*]] = add i32 [[PHI2]], 0 +; CHECK-NEXT: [[XOR20:%.*]] = xor i32 [[ADD18]], [[XOR16]] +; CHECK-NEXT: [[XOR21:%.*]] = xor i32 [[XOR20]], [[ADD17]] +; CHECK-NEXT: [[XOR22:%.*]] = xor i32 [[XOR21]], [[ADD19]] +; CHECK-NEXT: [[ADD25:%.*]] = add i32 [[PHI2]], 0 +; CHECK-NEXT: [[ADD24]] = add i32 [[ARG]], 0 +; CHECK-NEXT: [[XOR25:%.*]] = xor i32 [[ADD25]], [[XOR22]] +; CHECK-NEXT: [[XOR26]] = xor i32 [[XOR25]], [[ADD24]] +; CHECK-NEXT: [[TMP5]] = add i32 1, 0 ; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i32 [[TMP5]], 0 ; CHECK-NEXT: br 
label %[[BB1]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll index 8fa84699a267c..52e13de8118d7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll @@ -5,21 +5,21 @@ define void @test(i32 %0, ptr %p) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: i32 [[TMP0:%.*]], ptr [[P:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP0]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[OP_RDX:%.*]] = extractelement <8 x i32> [[TMP8]], i32 3 ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[PH:%.*]] ; CHECK: ph: -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> , i32 [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> , i32 [[TMP0]], i32 4 ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: [[TMP9:%.*]] = phi <8 x i32> [ [[TMP8]], [[ENTRY:%.*]] ], [ [[TMP6]], [[PH]] ] ; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i32> [ [[TMP5]], [[ENTRY]] ], [ zeroinitializer, [[PH]] ] -; CHECK-NEXT: [[OP_RDX:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP9]]) -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP7]]) -; CHECK-NEXT: [[OP_RDX5:%.*]] = or i32 [[TMP10]], [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> [[TMP9]], 
i64 0) +; CHECK-NEXT: [[RDX_OP:%.*]] = or <4 x i32> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP9]], <4 x i32> [[RDX_OP]], i64 0) +; CHECK-NEXT: [[OP_RDX5:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP11]]) ; CHECK-NEXT: [[OP_RDX2:%.*]] = or i32 [[OP_RDX5]], [[OP_RDX]] ; CHECK-NEXT: store i32 [[OP_RDX2]], ptr [[P]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/select-reduction-op.ll b/llvm/test/Transforms/SLPVectorizer/X86/select-reduction-op.ll index 5f62def150d8f..f16bf31f85ecc 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/select-reduction-op.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/select-reduction-op.ll @@ -4,10 +4,12 @@ define i1 @src(i1 %cmp4.118.i) { ; CHECK-LABEL: define i1 @src( ; CHECK-SAME: i1 [[CMP4_118_I:%.*]]) { -; CHECK-NEXT: [[CMP4_118_I_NOT:%.*]] = xor i1 [[CMP4_118_I]], true -; CHECK-NEXT: [[TMP1:%.*]] = freeze <4 x i1> poison +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i1> , i1 [[CMP4_118_I]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], +; CHECK-NEXT: [[DOTNOT7:%.*]] = xor i1 poison, true +; CHECK-NEXT: [[TMP1:%.*]] = freeze <4 x i1> [[TMP5]] ; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP1]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[CMP4_118_I_NOT]], i1 true, i1 [[TMP2]] +; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP2]], i1 true, i1 [[DOTNOT7]] ; CHECK-NEXT: [[TMP3:%.*]] = freeze i1 [[OP_RDX]] ; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[TMP3]], i1 true, i1 poison ; CHECK-NEXT: ret i1 [[OP_RDX1]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-cmps.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-cmps.ll index b8a2d8431b1e6..0f625c4443606 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-cmps.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-cmps.ll @@ -4,12 +4,12 @@ define i32 @test(ptr %isec, float %0) { ; CHECK-LABEL: 
@test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ISEC:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> , float [[TMP0:%.*]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ISEC:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[ISEC]], i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX10]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP0:%.*]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = fmul fast float 0.000000e+00, [[TMP1]] ; CHECK-NEXT: [[CMP61:%.*]] = fcmp fast oge float 0.000000e+00, 0.000000e+00 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 ; CHECK-NEXT: [[CMP63:%.*]] = fcmp fast ogt float [[TMP4]], [[TMP5]] ; CHECK-NEXT: br i1 [[CMP63]], label [[CLEANUP:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: diff --git a/llvm/test/Transforms/SLPVectorizer/full-overlap-non-schedulable.ll b/llvm/test/Transforms/SLPVectorizer/full-overlap-non-schedulable.ll index c704baaad6f71..043205822b1c5 100644 --- a/llvm/test/Transforms/SLPVectorizer/full-overlap-non-schedulable.ll +++ b/llvm/test/Transforms/SLPVectorizer/full-overlap-non-schedulable.ll @@ -6,42 +6,45 @@ define void @test(ptr %p1, ptr %0, i32 %1, i1 %c1, ptr %p2) { ; CHECK-SAME: ptr [[P1:%.*]], ptr [[TMP0:%.*]], i32 [[TMP1:%.*]], i1 [[C1:%.*]], ptr [[P2:%.*]]) { ; CHECK-NEXT: [[TOP:.*:]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i64 8 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr> [[TMP3]], <4 x ptr> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, <4 x ptr> [[TMP4]], <4 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[TMP5]], i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, 
ptr [[TMP0]], i64 12 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP0]], i64 16 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP0]], i64 20 ; CHECK-NEXT: br i1 [[C1]], label %[[L42:.*]], label %[[L41:.*]] ; CHECK: [[L41]]: -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x ptr> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> zeroinitializer, <4 x i32> [[TMP8]] +; CHECK-NEXT: [[DOTNOT276:%.*]] = icmp eq ptr [[TMP2]], null +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[DOTNOT276]], i32 0, i32 [[TMP10]] +; CHECK-NEXT: [[DOTNOT277:%.*]] = icmp eq ptr [[TMP12]], null +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[DOTNOT277]], i32 0, i32 [[TMP8]] +; CHECK-NEXT: [[DOTNOT278:%.*]] = icmp eq ptr [[TMP4]], null +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[DOTNOT278]], i32 0, i32 [[TMP15]] +; CHECK-NEXT: [[DOTNOT279:%.*]] = icmp eq ptr [[TMP5]], null +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP5]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[DOTNOT279]], i32 0, i32 [[TMP20]] ; CHECK-NEXT: br label %[[L112:.*]] ; CHECK: [[L42]]: -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[DOTNOT280:%.*]] = icmp eq i32 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> , i32 [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[DOTNOT280:%.*]] = icmp eq i32 [[TMP14]], 0 ; CHECK-NEXT: br i1 [[DOTNOT280]], label %[[L112]], label %[[L47:.*]] ; CHECK: [[L47]]: -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x ptr> [[TMP5]], i32 1 ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP6]], i32 0 -; CHECK-NEXT: 
[[TMP26:%.*]] = shufflevector <4 x ptr> [[TMP5]], <4 x ptr> poison, <2 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x ptr> [[TMP25]], <2 x ptr> [[TMP26]], <2 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <2 x ptr> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4 -; CHECK-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> zeroinitializer, <2 x i32> [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> , i32 [[TMP13]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP18]], <2 x i32> [[TMP17]], i64 2) +; CHECK-NEXT: [[DOTNOT282:%.*]] = icmp eq ptr [[TMP4]], null +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[DOTNOT282]], i32 0, i32 [[TMP16]] +; CHECK-NEXT: [[DOTNOT283:%.*]] = icmp eq ptr [[TMP5]], null +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP5]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[DOTNOT283]], i32 0, i32 [[TMP18]] ; CHECK-NEXT: br label %[[L112]] ; CHECK: [[L112]]: -; CHECK-NEXT: [[TMP20:%.*]] = phi <4 x i32> [ [[TMP19]], %[[L47]] ], [ [[TMP9]], %[[L41]] ], [ [[TMP11]], %[[L42]] ] -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = phi i32 [ [[TMP19]], %[[L47]] ], [ [[TMP25]], %[[L41]] ], [ 0, %[[L42]] ] +; CHECK-NEXT: [[TMP23:%.*]] = phi i32 [ [[TMP17]], %[[L47]] ], [ [[TMP11]], %[[L41]] ], [ [[TMP1]], %[[L42]] ] +; CHECK-NEXT: [[TMP22:%.*]] = phi i32 [ [[TMP13]], %[[L47]] ], [ [[TMP9]], %[[L41]] ], [ 0, %[[L42]] ] +; CHECK-NEXT: [[TMP21:%.*]] = phi i32 [ 0, %[[L47]] ], [ [[TMP7]], %[[L41]] ], [ 0, %[[L42]] ] ; CHECK-NEXT: store i32 [[TMP21]], ptr [[P2]], align 4 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP20]], i32 1 ; CHECK-NEXT: store i32 [[TMP22]], ptr [[P1]], align 4 -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP20]], i32 2 ; CHECK-NEXT: store i32 [[TMP23]], ptr [[P2]], 
align 4 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP20]], i32 3 ; CHECK-NEXT: store i32 [[TMP24]], ptr [[P1]], align 4 ; CHECK-NEXT: ret void ;