diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b62c8f1631ff7..6564e3120a264 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -24126,8 +24126,46 @@ class HorizontalReduction {
     bool CheckForReusedReductionOps = false;
     // Try to vectorize elements based on their type.
     SmallVector<InstructionsState> States;
-    for (ArrayRef<Value *> RV : ReducedVals)
+    SmallVector<SmallVector<Value *>> LocalReducedVals;
+    // Try to merge consecutive reduced values into a single vectorizable group
+    // and check if they can be vectorized as copyables.
+    for (ArrayRef<Value *> RV : ReducedVals) {
+      // Loads are not very compatible with undefs.
+      if (isa<UndefValue>(RV.front()) &&
+          (States.empty() || !States.back() ||
+           States.back().getOpcode() == Instruction::Load)) {
+        LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
+        States.push_back(InstructionsState::invalid());
+        continue;
+      }
+      if (!LocalReducedVals.empty() &&
+          isa<UndefValue>(LocalReducedVals.back().front()) &&
+          isa<LoadInst>(RV.front())) {
+        LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
+        States.push_back(getSameOpcode(RV, TLI));
+        continue;
+      }
+      SmallVector<Value *> Ops;
+      if (!LocalReducedVals.empty())
+        Ops = LocalReducedVals.back();
+      Ops.append(RV.begin(), RV.end());
+      InstructionsCompatibilityAnalysis Analysis(DT, DL, *TTI, TLI);
+      InstructionsState OpS =
+          Analysis.buildInstructionsState(Ops, V, VectorizeCopyableElements);
+      if (LocalReducedVals.empty()) {
+        LocalReducedVals.push_back(Ops);
+        States.push_back(OpS);
+        continue;
+      }
+      if (OpS) {
+        LocalReducedVals.back().swap(Ops);
+        States.back() = OpS;
+        continue;
+      }
+      LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
       States.push_back(getSameOpcode(RV, TLI));
+    }
+    ReducedVals.swap(LocalReducedVals);
     for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
       ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
       InstructionsState S = States[I];
@@ -24142,8 +24180,10 @@ class HorizontalReduction {
         // Also check if the instruction was folded to constant/other value.
         auto *Inst = dyn_cast<Instruction>(RdxVal);
         if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
-             (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
-            (S && !Inst))
+             (!S || (!S.getMatchingMainOpOrAltOp(Inst) &&
+                     !S.isCopyableElement(Inst)))) ||
+            (S && !Inst && !isa<UndefValue>(RdxVal) &&
+             !S.isCopyableElement(RdxVal)))
           continue;
         Candidates.push_back(RdxVal);
         TrackedToOrig.try_emplace(RdxVal, ReducedVal);
@@ -24728,6 +24768,8 @@ class HorizontalReduction {
           // Scalar cost is repeated for N-1 elements.
           int Cnt = ReducedVals.size();
           for (Value *RdxVal : ReducedVals) {
+            if (!isa<Instruction>(RdxVal))
+              continue;
             if (Cnt == 1)
               break;
             --Cnt;
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr48223.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr48223.ll
index 415089ae213bb..5534ae249a463 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr48223.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr48223.ll
@@ -1,131 +1,31 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -O3 -S -mtriple=x86_64-- -mcpu=x86-64 < %s | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 < %s | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
-; RUN: opt -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 < %s | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 < %s | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt -O3 -S -mtriple=x86_64-- -mcpu=x86-64 < %s | FileCheck %s
+; RUN: opt -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 < %s | FileCheck %s
+; RUN: opt -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 < %s | FileCheck %s
+; RUN: opt -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 < %s | FileCheck %s
 %"struct.std::array" = type { [8 x i16] }
 define { i64, i64 } @compute_min(ptr noundef nonnull align 2 dereferenceable(16) %x, ptr noundef nonnull align 2 dereferenceable(16) %y) {
-; SSE2-LABEL: @compute_min(
-; SSE2-NEXT: entry:
-; SSE2-NEXT: [[LD0:%.*]] = load i16, ptr [[Y:%.*]], align 2
-; SSE2-NEXT: [[LD1:%.*]] = load i16, ptr [[X:%.*]], align 2
-; SSE2-NEXT: [[LD2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[LD0]], i16 [[LD1]])
-; SSE2-NEXT: [[PT1_1:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 2
-; SSE2-NEXT: [[PT0_1:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 2
-; SSE2-NEXT: [[LD0_1:%.*]] = load i16, ptr [[PT0_1]], align 2
-; SSE2-NEXT: [[LD1_1:%.*]] = load i16, ptr [[PT1_1]], align 2
-; SSE2-NEXT: [[LD2_1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[LD0_1]], i16 [[LD1_1]])
-; SSE2-NEXT: [[PT1_2:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 4
-; SSE2-NEXT: [[PT0_2:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 4
-; SSE2-NEXT: [[LD0_2:%.*]] = load i16, ptr [[PT0_2]], align 2
-; SSE2-NEXT: [[LD1_2:%.*]] = load i16, ptr [[PT1_2]], align 2
-; SSE2-NEXT: [[LD2_2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[LD0_2]], i16 [[LD1_2]])
-; SSE2-NEXT: [[PT1_3:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 6
-; SSE2-NEXT: [[PT0_3:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 6
-; SSE2-NEXT: [[LD0_3:%.*]] = load i16, ptr [[PT0_3]], align 2
-; SSE2-NEXT: [[LD1_3:%.*]] = load i16, ptr [[PT1_3]], align 2
-; SSE2-NEXT: [[LD2_3:%.*]] = tail call i16 @llvm.smin.i16(i16 [[LD0_3]], i16 [[LD1_3]])
-; SSE2-NEXT: [[PT1_4:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 8
-; SSE2-NEXT: [[PT0_4:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 8
-; SSE2-NEXT: [[LD0_4:%.*]] = load i16, ptr [[PT0_4]], align 2
-; SSE2-NEXT: [[LD1_4:%.*]] = load i16, ptr [[PT1_4]], align 2
-; SSE2-NEXT: [[LD2_4:%.*]] = tail call i16 @llvm.smin.i16(i16 [[LD0_4]], i16 [[LD1_4]])
-; SSE2-NEXT: [[PT1_5:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 10
-; SSE2-NEXT: [[PT0_5:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 10
-; SSE2-NEXT: [[LD0_5:%.*]] = load i16, ptr [[PT0_5]], align 2
-; SSE2-NEXT: [[LD1_5:%.*]] = load i16, ptr [[PT1_5]], align 2
-; SSE2-NEXT: [[LD2_5:%.*]] = tail call i16 @llvm.smin.i16(i16 [[LD0_5]], i16 [[LD1_5]])
-; SSE2-NEXT: [[PT1_6:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64
12 -; SSE2-NEXT: [[PT0_6:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 12 -; SSE2-NEXT: [[LD0_6:%.*]] = load i16, ptr [[PT0_6]], align 2 -; SSE2-NEXT: [[LD1_6:%.*]] = load i16, ptr [[PT1_6]], align 2 -; SSE2-NEXT: [[LD2_6:%.*]] = tail call i16 @llvm.smin.i16(i16 [[LD0_6]], i16 [[LD1_6]]) -; SSE2-NEXT: [[PT1_7:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 14 -; SSE2-NEXT: [[PT0_7:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 14 -; SSE2-NEXT: [[LD0_7:%.*]] = load i16, ptr [[PT0_7]], align 2 -; SSE2-NEXT: [[LD1_7:%.*]] = load i16, ptr [[PT1_7]], align 2 -; SSE2-NEXT: [[LD2_7:%.*]] = tail call i16 @llvm.smin.i16(i16 [[LD0_7]], i16 [[LD1_7]]) -; SSE2-NEXT: [[RETVAL_SROA_4_0_INSERT_EXT:%.*]] = zext i16 [[LD2_3]] to i64 -; SSE2-NEXT: [[RETVAL_SROA_4_0_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_4_0_INSERT_EXT]], 48 -; SSE2-NEXT: [[RETVAL_SROA_3_0_INSERT_EXT:%.*]] = zext i16 [[LD2_2]] to i64 -; SSE2-NEXT: [[RETVAL_SROA_3_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_3_0_INSERT_EXT]], 32 -; SSE2-NEXT: [[RETVAL_SROA_3_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_4_0_INSERT_SHIFT]], [[RETVAL_SROA_3_0_INSERT_SHIFT]] -; SSE2-NEXT: [[RETVAL_SROA_2_0_INSERT_EXT:%.*]] = zext i16 [[LD2_1]] to i64 -; SSE2-NEXT: [[RETVAL_SROA_2_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_2_0_INSERT_EXT]], 16 -; SSE2-NEXT: [[RETVAL_SROA_2_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_3_0_INSERT_INSERT]], [[RETVAL_SROA_2_0_INSERT_SHIFT]] -; SSE2-NEXT: [[RETVAL_SROA_0_0_INSERT_EXT:%.*]] = zext i16 [[LD2]] to i64 -; SSE2-NEXT: [[TMP20:%.*]] = or disjoint i64 [[RETVAL_SROA_2_0_INSERT_INSERT]], [[RETVAL_SROA_0_0_INSERT_EXT]] -; SSE2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP20]], 0 -; SSE2-NEXT: [[RETVAL_SROA_9_8_INSERT_EXT:%.*]] = zext i16 [[LD2_7]] to i64 -; SSE2-NEXT: [[RETVAL_SROA_9_8_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_9_8_INSERT_EXT]], 48 -; SSE2-NEXT: [[RETVAL_SROA_8_8_INSERT_EXT:%.*]] = zext i16 [[LD2_6]] to i64 -; SSE2-NEXT: [[RETVAL_SROA_8_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_8_8_INSERT_EXT]], 32 -; SSE2-NEXT: [[RETVAL_SROA_8_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_9_8_INSERT_SHIFT]], [[RETVAL_SROA_8_8_INSERT_SHIFT]] -; SSE2-NEXT: [[RETVAL_SROA_7_8_INSERT_EXT:%.*]] = zext i16 [[LD2_5]] to i64 -; SSE2-NEXT: [[RETVAL_SROA_7_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_7_8_INSERT_EXT]], 16 -; SSE2-NEXT: [[RETVAL_SROA_7_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_8_8_INSERT_INSERT]], [[RETVAL_SROA_7_8_INSERT_SHIFT]] -; SSE2-NEXT: [[RETVAL_SROA_5_8_INSERT_EXT:%.*]] = zext i16 [[LD2_4]] to i64 -; SSE2-NEXT: [[TMP21:%.*]] = or disjoint i64 [[RETVAL_SROA_7_8_INSERT_INSERT]], [[RETVAL_SROA_5_8_INSERT_EXT]] -; SSE2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP21]], 1 -; SSE2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] -; -; SSE4-LABEL: @compute_min( -; SSE4-NEXT: entry: -; SSE4-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[Y:%.*]], align 2 -; SSE4-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[X:%.*]], align 2 -; SSE4-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) -; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <2 x i32> -; SSE4-NEXT: [[TMP4:%.*]] = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) -; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <2 x i32> -; SSE4-NEXT: [[TMP6:%.*]] = tail call <8 x i16> 
@llvm.smin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) -; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <2 x i32> -; SSE4-NEXT: [[TMP8:%.*]] = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) -; SSE4-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> poison, <2 x i32> -; SSE4-NEXT: [[TMP10:%.*]] = zext <2 x i16> [[TMP9]] to <2 x i64> -; SSE4-NEXT: [[TMP11:%.*]] = shl nuw <2 x i64> [[TMP10]], -; SSE4-NEXT: [[TMP12:%.*]] = zext <2 x i16> [[TMP7]] to <2 x i64> -; SSE4-NEXT: [[TMP13:%.*]] = shl nuw <2 x i64> [[TMP12]], -; SSE4-NEXT: [[TMP14:%.*]] = or disjoint <2 x i64> [[TMP11]], [[TMP13]] -; SSE4-NEXT: [[TMP15:%.*]] = zext <2 x i16> [[TMP5]] to <2 x i64> -; SSE4-NEXT: [[TMP16:%.*]] = shl nuw nsw <2 x i64> [[TMP15]], splat (i64 16) -; SSE4-NEXT: [[TMP17:%.*]] = or disjoint <2 x i64> [[TMP14]], [[TMP16]] -; SSE4-NEXT: [[TMP18:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i64> -; SSE4-NEXT: [[TMP19:%.*]] = or disjoint <2 x i64> [[TMP17]], [[TMP18]] -; SSE4-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP19]], i64 0 -; SSE4-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP20]], 0 -; SSE4-NEXT: [[TMP21:%.*]] = extractelement <2 x i64> [[TMP19]], i64 1 -; SSE4-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP21]], 1 -; SSE4-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] -; -; AVX-LABEL: @compute_min( -; AVX-NEXT: entry: -; AVX-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[Y:%.*]], align 2 -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[X:%.*]], align 2 -; AVX-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <2 x i32> -; AVX-NEXT: [[TMP4:%.*]] = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <2 x i32> -; AVX-NEXT: [[TMP6:%.*]] = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) -; AVX-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <2 x i32> -; AVX-NEXT: [[TMP8:%.*]] = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) -; AVX-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> poison, <2 x i32> -; AVX-NEXT: [[TMP10:%.*]] = zext <2 x i16> [[TMP9]] to <2 x i64> -; AVX-NEXT: [[TMP11:%.*]] = shl nuw <2 x i64> [[TMP10]], -; AVX-NEXT: [[TMP12:%.*]] = zext <2 x i16> [[TMP7]] to <2 x i64> -; AVX-NEXT: [[TMP13:%.*]] = shl nuw <2 x i64> [[TMP12]], -; AVX-NEXT: [[TMP14:%.*]] = or disjoint <2 x i64> [[TMP11]], [[TMP13]] -; AVX-NEXT: [[TMP15:%.*]] = zext <2 x i16> [[TMP5]] to <2 x i64> -; AVX-NEXT: [[TMP16:%.*]] = shl nuw nsw <2 x i64> [[TMP15]], splat (i64 16) -; AVX-NEXT: [[TMP17:%.*]] = or disjoint <2 x i64> [[TMP14]], [[TMP16]] -; AVX-NEXT: [[TMP18:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i64> -; AVX-NEXT: [[TMP19:%.*]] = or disjoint <2 x i64> [[TMP17]], [[TMP18]] -; AVX-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP19]], i64 0 -; AVX-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP20]], 0 -; AVX-NEXT: [[TMP21:%.*]] = extractelement <2 x i64> [[TMP19]], i64 1 -; AVX-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP21]], 1 -; AVX-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] +; CHECK-LABEL: @compute_min( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[PT1_4:%.*]] = getelementptr inbounds nuw i8, ptr 
[[X:%.*]], i64 8 +; CHECK-NEXT: [[PT0_4:%.*]] = getelementptr inbounds nuw i8, ptr [[Y:%.*]], i64 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[Y]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[X]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw <4 x i64> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP4]]) +; CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[PT0_4]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i16>, ptr [[PT1_4]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = tail call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP6]], <4 x i16> [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = zext <4 x i16> [[TMP8]] to <4 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw <4 x i64> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = tail call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP10]]) +; CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP11]], 1 +; CHECK-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] ; entry: %retval = alloca %"struct.std::array", align 2 @@ -158,6 +58,3 @@ for.body: ; preds = %for.cond %inc = add nuw nsw i32 %i.0, 1 br label %for.cond } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK: {{.*}} -; SSE: {{.*}} diff --git a/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll b/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll index d36da8d028c60..a2ccbb96b6003 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll @@ -18,14 +18,15 @@ define <4 x i32> @square(<4 x i32> %num, i32 %y, i32 %x, i32 %h, i32 %k, i32 %w, ; CHECK-NEXT: [[MUL13:%.*]] = mul nsw i32 [[W:%.*]], 53 ; CHECK-NEXT: [[DIV17:%.*]] = sdiv i32 [[X:%.*]], 820 ; CHECK-NEXT: [[MUL21:%.*]] = shl nsw i32 [[U:%.*]], 2 -; CHECK-NEXT: [[OP_RDX:%.*]] = add nsw i32 [[DIV17]], 317426 -; CHECK-NEXT: [[OP_RDX9:%.*]] = add nsw i32 [[DIV]], [[DIV9]] -; CHECK-NEXT: [[OP_RDX10:%.*]] = add i32 [[MUL5]], [[MUL13]] -; CHECK-NEXT: [[OP_RDX11:%.*]] = add i32 [[MUL]], [[MUL21]] -; CHECK-NEXT: [[OP_RDX12:%.*]] = add i32 [[OP_RDX]], [[OP_RDX9]] -; CHECK-NEXT: [[OP_RDX13:%.*]] = add i32 [[OP_RDX10]], [[OP_RDX11]] -; CHECK-NEXT: [[OP_RDX14:%.*]] = add i32 [[OP_RDX12]], [[OP_RDX13]] -; CHECK-NEXT: [[OP_RDX15:%.*]] = add i32 [[OP_RDX14]], [[Y:%.*]] +; CHECK-NEXT: [[DOTSCALAR:%.*]] = add i32 [[Y:%.*]], 1 +; CHECK-NEXT: [[DOTSCALAR1:%.*]] = add i32 [[DOTSCALAR]], [[DIV17]] +; CHECK-NEXT: [[DOTSCALAR2:%.*]] = add i32 [[DOTSCALAR1]], [[MUL5]] +; CHECK-NEXT: [[DOTSCALAR3:%.*]] = add i32 [[DOTSCALAR2]], [[DIV]] +; CHECK-NEXT: [[DOTSCALAR4:%.*]] = add i32 [[DOTSCALAR3]], [[MUL13]] +; CHECK-NEXT: [[DOTSCALAR5:%.*]] = add i32 [[DOTSCALAR4]], [[MUL]] +; CHECK-NEXT: [[DOTSCALAR6:%.*]] = add i32 [[DOTSCALAR5]], [[DIV9]] +; CHECK-NEXT: [[DOTSCALAR7:%.*]] = add i32 [[DOTSCALAR6]], [[MUL21]] +; CHECK-NEXT: [[OP_RDX15:%.*]] = add i32 [[DOTSCALAR7]], 317425 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[OP_RDX15]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[ADD29:%.*]] = add <4 x i32> [[TMP2]], [[NUM:%.*]] diff 
--git a/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll b/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll index c3131a41c2b2e..eb9b249b9a898 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll @@ -18,14 +18,15 @@ define <4 x i32> @square(<4 x i32> %num, i32 %y, i32 %x, i32 %h, i32 %k, i32 %w, ; CHECK-NEXT: [[MUL13:%.*]] = mul nsw i32 [[W:%.*]], 53 ; CHECK-NEXT: [[DIV17:%.*]] = sdiv i32 [[X:%.*]], 820 ; CHECK-NEXT: [[MUL21:%.*]] = shl nsw i32 [[U:%.*]], 2 -; CHECK-NEXT: [[OP_RDX:%.*]] = add nsw i32 [[DIV17]], 317426 -; CHECK-NEXT: [[OP_RDX9:%.*]] = add nsw i32 [[DIV]], [[DIV9]] -; CHECK-NEXT: [[OP_RDX10:%.*]] = add i32 [[MUL5]], [[MUL13]] -; CHECK-NEXT: [[OP_RDX11:%.*]] = add i32 [[MUL]], [[MUL21]] -; CHECK-NEXT: [[OP_RDX12:%.*]] = add i32 [[OP_RDX]], [[OP_RDX9]] -; CHECK-NEXT: [[OP_RDX13:%.*]] = add i32 [[OP_RDX10]], [[OP_RDX11]] -; CHECK-NEXT: [[OP_RDX14:%.*]] = add i32 [[OP_RDX12]], [[OP_RDX13]] -; CHECK-NEXT: [[OP_RDX15:%.*]] = add i32 [[OP_RDX14]], [[Y:%.*]] +; CHECK-NEXT: [[DOTSCALAR:%.*]] = add i32 [[Y:%.*]], 1 +; CHECK-NEXT: [[DOTSCALAR1:%.*]] = add i32 [[DOTSCALAR]], [[DIV17]] +; CHECK-NEXT: [[DOTSCALAR2:%.*]] = add i32 [[DOTSCALAR1]], [[MUL5]] +; CHECK-NEXT: [[DOTSCALAR3:%.*]] = add i32 [[DOTSCALAR2]], [[DIV]] +; CHECK-NEXT: [[DOTSCALAR4:%.*]] = add i32 [[DOTSCALAR3]], [[MUL13]] +; CHECK-NEXT: [[DOTSCALAR5:%.*]] = add i32 [[DOTSCALAR4]], [[MUL]] +; CHECK-NEXT: [[DOTSCALAR6:%.*]] = add i32 [[DOTSCALAR5]], [[DIV9]] +; CHECK-NEXT: [[DOTSCALAR7:%.*]] = add i32 [[DOTSCALAR6]], [[MUL21]] +; CHECK-NEXT: [[OP_RDX15:%.*]] = add i32 [[DOTSCALAR7]], 317425 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[OP_RDX15]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[ADD29:%.*]] = add <4 x i32> [[TMP2]], [[NUM:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reduced-value-repeated-and-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reduced-value-repeated-and-vectorized.ll index a985207fb97f4..d431813d7ca23 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reduced-value-repeated-and-vectorized.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reduced-value-repeated-and-vectorized.ll @@ -9,8 +9,8 @@ define void @test() { ; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr null, align 2 ; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i16> [[TMP0]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> [[TMP2]]) -; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.smax.i16(i16 [[TMP1]], i16 [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = call i16 @llvm.smax.i16(i16 [[TMP4]], i16 0) +; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.smax.i16(i16 [[TMP1]], i16 0) +; CHECK-NEXT: [[TMP5:%.*]] = call i16 @llvm.smax.i16(i16 [[TMP4]], i16 [[TMP3]]) ; CHECK-NEXT: [[TMP6:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP5]], i16 0) ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll index 143e09374a891..e3279290c3074 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -5,34 +5,38 @@ define void @Test(i32) { ; CHECK-LABEL: @Test( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[LOCAL_8_43_US:%.*]] = phi i32 [ 
[[VAL_43:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], +; CHECK-NEXT: [[LOCAL_8_43_US1:%.*]] = phi i32 [ [[VAL_44:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[LOCAL_8_43_US]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[LOCAL_8_43_US1]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP6]], +; CHECK-NEXT: [[VAL_41:%.*]] = add i32 [[LOCAL_8_43_US1]], 13685 ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US]] -; CHECK-NEXT: [[VAL_43]] = add i32 [[LOCAL_8_43_US]], 14910 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0 -; CHECK-NEXT: [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[VAL_43]], i32 1 +; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP4]], [[VAL_41]] +; CHECK-NEXT: [[VAL_43]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US1]] +; CHECK-NEXT: [[VAL_44]] = add i32 [[LOCAL_8_43_US1]], 14910 ; CHECK-NEXT: br label [[LOOP]] ; ; FORCE_REDUCTION-LABEL: @Test( ; FORCE_REDUCTION-NEXT: entry: +; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0 ; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]] ; FORCE_REDUCTION: loop: ; FORCE_REDUCTION-NEXT: [[LOCAL_8_43_US:%.*]] = phi i32 [ [[VAL_43:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] -; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ] -; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> -; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], +; FORCE_REDUCTION-NEXT: [[LOCAL_8_43_US1:%.*]] = phi i32 [ [[VAL_44:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[LOCAL_8_43_US]], i32 1 +; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[LOCAL_8_43_US1]], i32 2 +; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> +; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP6]], +; FORCE_REDUCTION-NEXT: [[VAL_41:%.*]] = add i32 [[LOCAL_8_43_US1]], 13685 ; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) -; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]] -; FORCE_REDUCTION-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US]] -; FORCE_REDUCTION-NEXT: [[VAL_43]] = add i32 [[LOCAL_8_43_US]], 14910 -; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[VAL_43]], i32 1 +; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP4]], [[VAL_41]] +; FORCE_REDUCTION-NEXT: [[VAL_43]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US1]] +; FORCE_REDUCTION-NEXT: [[VAL_44]] = add i32 [[LOCAL_8_43_US1]], 14910 ; FORCE_REDUCTION-NEXT: br label [[LOOP]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll index 7a1cf7b573a99..bb177196904fc 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll @@ -11,8 +11,9 @@ define i64 @foo(i32 %tmp7) { ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> zeroinitializer, [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP10]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 0, [[TMP8]] +; CHECK-NEXT: [[TMP8:%.*]] = mul <8 x i32> [[TMP10]], splat (i32 1) +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP8]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 0, [[TMP11]] ; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[OP_RDX]] to i64 ; CHECK-NEXT: ret i64 [[TMP64]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll index 79c6c6b3f046f..d7c63457bf5c1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll @@ -8,11 +8,11 @@ define i8 @test() { ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 0 to i8 ; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 0 to i8 ; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 0 to i8 +; CHECK-NEXT: [[TMP4:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> zeroinitializer) ; CHECK-NEXT: [[OP_RDX1:%.*]] = or i8 [[TMP0]], [[TMP2]] ; CHECK-NEXT: [[OP_RDX2:%.*]] = or i8 [[OP_RDX1]], [[TMP0]] ; CHECK-NEXT: [[OP_RDX3:%.*]] = or i8 [[OP_RDX2]], [[TMP1]] ; CHECK-NEXT: [[OP_RDX5:%.*]] = or i8 [[OP_RDX3]], [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> zeroinitializer) ; CHECK-NEXT: [[OP_RDX4:%.*]] = or i8 [[OP_RDX5]], [[TMP4]] ; CHECK-NEXT: ret i8 [[OP_RDX4]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll index 4f8661f6bac07..330022478ebdb 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll @@ -7,129 +7,61 @@ define { i64, i64 } @compute_min(ptr nocapture noundef nonnull readonly align 2 dereferenceable(16) %x, ptr nocapture noundef nonnull readonly align 2 dereferenceable(16) %y) { ; SSE-LABEL: @compute_min( ; SSE-NEXT: entry: -; SSE-NEXT: [[TMP0:%.*]] = load i16, ptr [[Y:%.*]], align 2 -; SSE-NEXT: [[TMP1:%.*]] = load i16, ptr [[X:%.*]], align 2 -; SSE-NEXT: [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP0]], i16 [[TMP1]]) -; SSE-NEXT: [[ARRAYIDX_I_I_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 1 -; SSE-NEXT: [[ARRAYIDX_I_I10_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 1 -; SSE-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_1]], align 2 -; SSE-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_I_I_1]], align 2 -; SSE-NEXT: [[TMP5:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP3]], i16 [[TMP4]]) -; SSE-NEXT: [[ARRAYIDX_I_I_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 2 -; SSE-NEXT: [[ARRAYIDX_I_I10_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 2 -; SSE-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_2]], align 2 -; SSE-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_I_I_2]], align 2 -; SSE-NEXT: [[TMP8:%.*]] = 
tail call i16 @llvm.smin.i16(i16 [[TMP6]], i16 [[TMP7]]) -; SSE-NEXT: [[ARRAYIDX_I_I_3:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 3 -; SSE-NEXT: [[ARRAYIDX_I_I10_3:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 3 -; SSE-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_3]], align 2 -; SSE-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_I_I_3]], align 2 -; SSE-NEXT: [[TMP11:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP9]], i16 [[TMP10]]) -; SSE-NEXT: [[ARRAYIDX_I_I_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 4 -; SSE-NEXT: [[ARRAYIDX_I_I10_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 4 -; SSE-NEXT: [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_4]], align 2 -; SSE-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX_I_I_4]], align 2 -; SSE-NEXT: [[TMP14:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP12]], i16 [[TMP13]]) -; SSE-NEXT: [[ARRAYIDX_I_I_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 5 -; SSE-NEXT: [[ARRAYIDX_I_I10_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 5 -; SSE-NEXT: [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_5]], align 2 -; SSE-NEXT: [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX_I_I_5]], align 2 -; SSE-NEXT: [[TMP17:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP15]], i16 [[TMP16]]) -; SSE-NEXT: [[ARRAYIDX_I_I_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 6 -; SSE-NEXT: [[ARRAYIDX_I_I10_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 6 -; SSE-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_6]], align 2 -; SSE-NEXT: [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX_I_I_6]], align 2 -; SSE-NEXT: [[TMP20:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP18]], i16 [[TMP19]]) -; SSE-NEXT: [[ARRAYIDX_I_I_7:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 7 -; SSE-NEXT: [[ARRAYIDX_I_I10_7:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 7 -; SSE-NEXT: [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_7]], align 2 -; SSE-NEXT: [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX_I_I_7]], align 2 -; SSE-NEXT: [[TMP23:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP21]], i16 [[TMP22]]) -; SSE-NEXT: [[RETVAL_SROA_4_0_INSERT_EXT:%.*]] = zext i16 [[TMP11]] to i64 -; SSE-NEXT: [[RETVAL_SROA_4_0_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_4_0_INSERT_EXT]], 48 -; SSE-NEXT: [[RETVAL_SROA_3_0_INSERT_EXT:%.*]] = zext i16 [[TMP8]] to i64 -; SSE-NEXT: [[RETVAL_SROA_3_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_3_0_INSERT_EXT]], 32 -; SSE-NEXT: [[RETVAL_SROA_3_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_4_0_INSERT_SHIFT]], [[RETVAL_SROA_3_0_INSERT_SHIFT]] -; SSE-NEXT: [[RETVAL_SROA_2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i64 -; SSE-NEXT: [[RETVAL_SROA_2_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_2_0_INSERT_EXT]], 16 -; SSE-NEXT: [[RETVAL_SROA_2_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_3_0_INSERT_INSERT]], [[RETVAL_SROA_2_0_INSERT_SHIFT]] -; SSE-NEXT: [[RETVAL_SROA_0_0_INSERT_EXT:%.*]] = zext i16 [[TMP2]] to i64 -; SSE-NEXT: [[RETVAL_SROA_0_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_2_0_INSERT_INSERT]], [[RETVAL_SROA_0_0_INSERT_EXT]] +; SSE-NEXT: [[ARRAYIDX_I_I_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[X:%.*]], i64 0, i64 4 +; SSE-NEXT: [[ARRAYIDX_I_I10_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y:%.*]], i64 0, i64 4 +; SSE-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[Y]], align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[X]], align 2 +; SSE-NEXT: [[TMP2:%.*]] = call <4 x i16> 
@llvm.smin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +; SSE-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> +; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i64> [[TMP3]], +; SSE-NEXT: [[RETVAL_SROA_0_0_INSERT_INSERT:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP4]]) ; SSE-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[RETVAL_SROA_0_0_INSERT_INSERT]], 0 -; SSE-NEXT: [[RETVAL_SROA_9_8_INSERT_EXT:%.*]] = zext i16 [[TMP23]] to i64 -; SSE-NEXT: [[RETVAL_SROA_9_8_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_9_8_INSERT_EXT]], 48 -; SSE-NEXT: [[RETVAL_SROA_8_8_INSERT_EXT:%.*]] = zext i16 [[TMP20]] to i64 -; SSE-NEXT: [[RETVAL_SROA_8_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_8_8_INSERT_EXT]], 32 -; SSE-NEXT: [[RETVAL_SROA_8_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_9_8_INSERT_SHIFT]], [[RETVAL_SROA_8_8_INSERT_SHIFT]] -; SSE-NEXT: [[RETVAL_SROA_7_8_INSERT_EXT:%.*]] = zext i16 [[TMP17]] to i64 -; SSE-NEXT: [[RETVAL_SROA_7_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_7_8_INSERT_EXT]], 16 -; SSE-NEXT: [[RETVAL_SROA_7_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_8_8_INSERT_INSERT]], [[RETVAL_SROA_7_8_INSERT_SHIFT]] -; SSE-NEXT: [[RETVAL_SROA_5_8_INSERT_EXT:%.*]] = zext i16 [[TMP14]] to i64 -; SSE-NEXT: [[RETVAL_SROA_5_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_7_8_INSERT_INSERT]], [[RETVAL_SROA_5_8_INSERT_EXT]] +; SSE-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_I_I10_4]], align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_I_I_4]], align 2 +; SSE-NEXT: [[TMP8:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP6]], <4 x i16> [[TMP7]]) +; SSE-NEXT: [[TMP9:%.*]] = zext <4 x i16> [[TMP8]] to <4 x i64> +; SSE-NEXT: [[TMP10:%.*]] = shl <4 x i64> [[TMP9]], +; SSE-NEXT: [[RETVAL_SROA_5_8_INSERT_INSERT:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP10]]) ; SSE-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[RETVAL_SROA_5_8_INSERT_INSERT]], 1 ; SSE-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] ; ; AVX-LABEL: @compute_min( ; AVX-NEXT: entry: -; AVX-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[Y:%.*]], align 2 -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[X:%.*]], align 2 -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> -; AVX-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP2]], <2 x i16> [[TMP3]]) -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> -; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> -; AVX-NEXT: [[TMP7:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP5]], <2 x i16> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> -; AVX-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> -; AVX-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP8]], <2 x i16> [[TMP9]]) -; AVX-NEXT: [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> -; AVX-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> -; AVX-NEXT: [[TMP13:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP11]], <2 x i16> [[TMP12]]) -; AVX-NEXT: [[TMP14:%.*]] = zext <2 x i16> [[TMP13]] to <2 x i64> -; AVX-NEXT: [[TMP15:%.*]] = shl nuw <2 x i64> [[TMP14]], -; AVX-NEXT: [[TMP16:%.*]] = zext <2 x i16> [[TMP10]] to <2 x i64> -; 
AVX-NEXT: [[TMP17:%.*]] = shl nuw <2 x i64> [[TMP16]], -; AVX-NEXT: [[TMP18:%.*]] = or <2 x i64> [[TMP15]], [[TMP17]] -; AVX-NEXT: [[TMP19:%.*]] = zext <2 x i16> [[TMP7]] to <2 x i64> -; AVX-NEXT: [[TMP20:%.*]] = shl nuw nsw <2 x i64> [[TMP19]], splat (i64 16) -; AVX-NEXT: [[TMP21:%.*]] = or <2 x i64> [[TMP18]], [[TMP20]] -; AVX-NEXT: [[TMP22:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i64> -; AVX-NEXT: [[TMP23:%.*]] = or <2 x i64> [[TMP21]], [[TMP22]] -; AVX-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP23]], i32 0 +; AVX-NEXT: [[ARRAYIDX_I_I_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[X:%.*]], i64 0, i64 4 +; AVX-NEXT: [[ARRAYIDX_I_I10_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y:%.*]], i64 0, i64 4 +; AVX-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[Y]], align 2 +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[X]], align 2 +; AVX-NEXT: [[TMP2:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> +; AVX-NEXT: [[TMP4:%.*]] = shl <4 x i64> [[TMP3]], +; AVX-NEXT: [[TMP24:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP4]]) ; AVX-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP24]], 0 -; AVX-NEXT: [[TMP25:%.*]] = extractelement <2 x i64> [[TMP23]], i32 1 +; AVX-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_I_I10_4]], align 2 +; AVX-NEXT: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_I_I_4]], align 2 +; AVX-NEXT: [[TMP8:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP6]], <4 x i16> [[TMP7]]) +; AVX-NEXT: [[TMP9:%.*]] = zext <4 x i16> [[TMP8]] to <4 x i64> +; AVX-NEXT: [[TMP10:%.*]] = shl <4 x i64> [[TMP9]], +; AVX-NEXT: [[TMP25:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP10]]) ; AVX-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP25]], 1 ; AVX-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] ; ; AVX2-LABEL: @compute_min( ; AVX2-NEXT: entry: -; AVX2-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[Y:%.*]], align 2 -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[X:%.*]], align 2 -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> -; AVX2-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP2]], <2 x i16> [[TMP3]]) -; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> -; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> -; AVX2-NEXT: [[TMP7:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP5]], <2 x i16> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> -; AVX2-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> -; AVX2-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP8]], <2 x i16> [[TMP9]]) -; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> -; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> -; AVX2-NEXT: [[TMP13:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP11]], <2 x i16> [[TMP12]]) -; AVX2-NEXT: [[TMP14:%.*]] = zext <2 x i16> [[TMP13]] to <2 x i64> -; AVX2-NEXT: [[TMP15:%.*]] = shl nuw <2 x i64> [[TMP14]], -; AVX2-NEXT: [[TMP16:%.*]] = zext <2 x i16> [[TMP10]] to <2 x i64> -; AVX2-NEXT: [[TMP17:%.*]] = shl nuw <2 x i64> [[TMP16]], -; AVX2-NEXT: [[TMP18:%.*]] = or <2 x i64> 
[[TMP15]], [[TMP17]] -; AVX2-NEXT: [[TMP19:%.*]] = zext <2 x i16> [[TMP7]] to <2 x i64> -; AVX2-NEXT: [[TMP20:%.*]] = shl nuw nsw <2 x i64> [[TMP19]], splat (i64 16) -; AVX2-NEXT: [[TMP21:%.*]] = or <2 x i64> [[TMP18]], [[TMP20]] -; AVX2-NEXT: [[TMP22:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i64> -; AVX2-NEXT: [[TMP23:%.*]] = or <2 x i64> [[TMP21]], [[TMP22]] -; AVX2-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP23]], i32 0 +; AVX2-NEXT: [[ARRAYIDX_I_I_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[X:%.*]], i64 0, i64 4 +; AVX2-NEXT: [[ARRAYIDX_I_I10_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y:%.*]], i64 0, i64 4 +; AVX2-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[Y]], align 2 +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[X]], align 2 +; AVX2-NEXT: [[TMP2:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +; AVX2-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> +; AVX2-NEXT: [[TMP4:%.*]] = shl <4 x i64> [[TMP3]], +; AVX2-NEXT: [[TMP24:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP4]]) ; AVX2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP24]], 0 -; AVX2-NEXT: [[TMP25:%.*]] = extractelement <2 x i64> [[TMP23]], i32 1 +; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_I_I10_4]], align 2 +; AVX2-NEXT: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_I_I_4]], align 2 +; AVX2-NEXT: [[TMP8:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP6]], <4 x i16> [[TMP7]]) +; AVX2-NEXT: [[TMP9:%.*]] = zext <4 x i16> [[TMP8]] to <4 x i64> +; AVX2-NEXT: [[TMP10:%.*]] = shl <4 x i64> [[TMP9]], +; AVX2-NEXT: [[TMP25:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP10]]) ; AVX2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP25]], 1 ; AVX2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll index 1cf837df719ec..56919ae0ffc90 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll @@ -10,9 +10,9 @@ define i32 @foo() { ; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[RDX_OP:%.*]] = mul <4 x i32> [[TMP4]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[RDX_OP]]) +; CHECK-NEXT: [[OP_RDX7:%.*]] = mul i32 0, [[TMP2]] ; CHECK-NEXT: [[OP_RDX:%.*]] = mul i32 0, [[TMP5]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = mul i32 [[OP_RDX]], 0 -; CHECK-NEXT: [[OP_RDX6:%.*]] = mul i32 [[OP_RDX1]], [[TMP2]] +; CHECK-NEXT: [[OP_RDX6:%.*]] = mul i32 [[OP_RDX7]], [[OP_RDX]] ; CHECK-NEXT: ret i32 [[OP_RDX6]] ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll index 6d6dd502415e5..dd3dee6910a4c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll @@ -6,7 +6,7 @@ define void @test() { ; CHECK-NEXT: br i1 false, label [[PH:%.*]], label [[EXIT:%.*]] ; CHECK: ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> zeroinitializer) -; CHECK-NEXT: [[OP_RDX:%.*]] = and i8 0, [[TMP0]] +; CHECK-NEXT: [[OP_RDX:%.*]] = and i8 [[TMP0]], 0 ; CHECK-NEXT: [[OP_RDX1:%.*]] = and i8 [[OP_RDX]], 0 ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll index f7811aba5ab5f..2774d5f3b64e4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll @@ -7,16 +7,17 @@ define <4 x i16> @test() { ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> zeroinitializer, <4 x i16> poison, <16 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = add <16 x i16> [[TMP0]], zeroinitializer ; CHECK-NEXT: [[RDX_OP:%.*]] = or <16 x i16> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[RDX_OP1:%.*]] = or <16 x i16> [[RDX_OP]], zeroinitializer +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i16> [[RDX_OP1]], <16 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP36]]) ; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i16> poison, i16 [[TMP37]], i64 0 -; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i16> [[RDX_OP1]], <16 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP40:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP39]]) ; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i16> [[TMP38]], i16 [[TMP40]], i64 1 -; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <16 x i16> [[RDX_OP1]], <16 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP43:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP42]]) ; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP41]], i16 [[TMP43]], i64 2 -; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i16> [[RDX_OP1]], <16 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP46:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP45]]) ; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP46]], i64 3 ; CHECK-NEXT: [[OP_RDX9:%.*]] = or <4 x i16> [[TMP47]], zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/operand-is-reduced-val.ll b/llvm/test/Transforms/SLPVectorizer/operand-is-reduced-val.ll index 62417268bf3d0..7ed1edc278806 100644 --- a/llvm/test/Transforms/SLPVectorizer/operand-is-reduced-val.ll +++ b/llvm/test/Transforms/SLPVectorizer/operand-is-reduced-val.ll @@ -1,27 +1,41 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux < %s -slp-threshold=-10 | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux < %s -slp-threshold=-10 | FileCheck %s %} +; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux < %s | FileCheck %s --check-prefix=X86 %} +; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux < %s | FileCheck %s --check-prefix=AARCH64 %} define i64 @src(i32 %a) { -; CHECK-LABEL: define i64 @src( -; CHECK-SAME: i32 [[A:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[A]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0 -; CHECK-NEXT: 
[[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i64> [[TMP3]], splat (i64 4294967297) -; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i64> [[TMP4]], splat (i64 1) -; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]]) -; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP4]]) -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> poison, i64 [[TMP16]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> [[TMP8]], i64 [[TMP18]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> , i64 [[TMP17]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP12]], [[TMP13]] -; CHECK-NEXT: ret i64 [[TMP21]] +; X86-LABEL: define i64 @src( +; X86-SAME: i32 [[A:%.*]]) { +; X86-NEXT: [[ENTRY:.*:]] +; X86-NEXT: [[TMP0:%.*]] = sext i32 [[A]] to i64 +; X86-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0 +; X86-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer +; X86-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64> +; X86-NEXT: [[TMP4:%.*]] = add nsw <4 x i64> [[TMP3]], splat (i64 4294967297) +; X86-NEXT: [[TMP5:%.*]] = and <4 x i64> [[TMP4]], splat (i64 1) +; X86-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; X86-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> +; X86-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <8 x i32> +; X86-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP8]]) +; X86-NEXT: [[OP_RDX:%.*]] = add i64 [[TMP9]], 4294967297 +; X86-NEXT: [[OP_RDX1:%.*]] = add i64 [[OP_RDX]], [[TMP0]] +; X86-NEXT: ret i64 [[OP_RDX1]] +; +; AARCH64-LABEL: define i64 @src( +; AARCH64-SAME: i32 [[A:%.*]]) { +; AARCH64-NEXT: [[ENTRY:.*:]] +; AARCH64-NEXT: [[TMP0:%.*]] = sext i32 [[A]] to i64 +; AARCH64-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0 +; AARCH64-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer +; AARCH64-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64> +; AARCH64-NEXT: [[TMP4:%.*]] = add nsw <4 x i64> [[TMP3]], splat (i64 4294967297) +; AARCH64-NEXT: [[TMP5:%.*]] = and <4 x i64> [[TMP4]], splat (i64 1) +; AARCH64-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; AARCH64-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> +; AARCH64-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> [[TMP4]], <8 x i32> +; AARCH64-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP8]]) +; AARCH64-NEXT: [[OP_RDX:%.*]] = add i64 [[TMP9]], 4294967297 +; AARCH64-NEXT: [[OP_RDX1:%.*]] = add i64 [[OP_RDX]], [[TMP0]] +; AARCH64-NEXT: ret i64 [[OP_RDX1]] ; entry: %0 = sext i32 %a to i64 diff --git a/llvm/test/Transforms/SLPVectorizer/reduction-modified-values.ll b/llvm/test/Transforms/SLPVectorizer/reduction-modified-values.ll index 2da65114eae04..c90e76c6d00f7 100644 --- a/llvm/test/Transforms/SLPVectorizer/reduction-modified-values.ll +++ b/llvm/test/Transforms/SLPVectorizer/reduction-modified-values.ll @@ 
-1,15 +1,25 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=X86 %} +; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=AARCH64 %} define i32 @test() { -; CHECK-LABEL: @test( -; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = or <4 x i32> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[RDX_OP:%.*]] = add <4 x i32> [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[OP_RDX:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[RDX_OP]]) -; CHECK-NEXT: ret i32 [[OP_RDX]] +; X86-LABEL: @test( +; X86-NEXT: bb: +; X86-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> +; X86-NEXT: [[TMP1:%.*]] = or <4 x i32> [[TMP0]], zeroinitializer +; X86-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> +; X86-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <8 x i32> +; X86-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> +; X86-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) +; X86-NEXT: ret i32 [[TMP5]] +; +; AARCH64-LABEL: @test( +; AARCH64-NEXT: bb: +; AARCH64-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> +; AARCH64-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <8 x i32> +; AARCH64-NEXT: [[TMP2:%.*]] = or <8 x i32> [[TMP1]], zeroinitializer +; AARCH64-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) +; AARCH64-NEXT: ret i32 [[TMP3]] ; bb: %0 = shufflevector <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32>