Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 43 additions & 3 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24126,8 +24126,44 @@ class HorizontalReduction {
bool CheckForReusedReductionOps = false;
// Try to vectorize elements based on their type.
SmallVector<InstructionsState> States;
for (ArrayRef<Value *> RV : ReducedVals)
SmallVector<SmallVector<Value *>> LocalReducedVals;
for (ArrayRef<Value *> RV : ReducedVals) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comments?

// Loads are not very compatible with undefs.
if (isa<UndefValue>(RV.front()) &&
(States.empty() || !States.back() ||
States.back().getOpcode() == Instruction::Load)) {
LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
States.push_back(InstructionsState::invalid());
continue;
}
if (!LocalReducedVals.empty() &&
isa<UndefValue>(LocalReducedVals.back().front()) &&
isa<LoadInst>(RV.front())) {
LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
States.push_back(getSameOpcode(RV, TLI));
continue;
}
SmallVector<Value *> Ops;
if (!LocalReducedVals.empty())
Ops = LocalReducedVals.back();
Ops.append(RV.begin(), RV.end());
InstructionsCompatibilityAnalysis Analysis(DT, DL, *TTI, TLI);
InstructionsState OpS =
Analysis.buildInstructionsState(Ops, V, VectorizeCopyableElements);
if (LocalReducedVals.empty()) {
LocalReducedVals.push_back(Ops);
States.push_back(OpS);
continue;
}
if (OpS) {
LocalReducedVals.back().swap(Ops);
States.back() = OpS;
continue;
}
LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
States.push_back(getSameOpcode(RV, TLI));
}
ReducedVals.swap(LocalReducedVals);
for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
InstructionsState S = States[I];
Expand All @@ -24142,8 +24178,10 @@ class HorizontalReduction {
// Also check if the instruction was folded to constant/other value.
auto *Inst = dyn_cast<Instruction>(RdxVal);
if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
(!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
(S && !Inst))
(!S || (!S.getMatchingMainOpOrAltOp(Inst) &&
!S.isCopyableElement(Inst)))) ||
(S && !Inst && !isa<PoisonValue>(RdxVal) &&
!S.isCopyableElement(RdxVal)))
continue;
Candidates.push_back(RdxVal);
TrackedToOrig.try_emplace(RdxVal, ReducedVal);
Expand Down Expand Up @@ -24728,6 +24766,8 @@ class HorizontalReduction {
// Scalar cost is repeated for N-1 elements.
int Cnt = ReducedVals.size();
for (Value *RdxVal : ReducedVals) {
if (!isa<Instruction>(RdxVal))
continue;
if (Cnt == 1)
break;
--Cnt;
Expand Down
149 changes: 23 additions & 126 deletions llvm/test/Transforms/PhaseOrdering/X86/pr48223.ll
Original file line number Diff line number Diff line change
@@ -1,131 +1,31 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -O3 -S -mtriple=x86_64-- -mcpu=x86-64 < %s | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: opt -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 < %s | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
; RUN: opt -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 < %s | FileCheck %s --check-prefixes=CHECK,AVX
; RUN: opt -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 < %s | FileCheck %s --check-prefixes=CHECK,AVX
; RUN: opt -O3 -S -mtriple=x86_64-- -mcpu=x86-64 < %s | FileCheck %s
; RUN: opt -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 < %s | FileCheck %s
; RUN: opt -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 < %s | FileCheck %s
; RUN: opt -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 < %s | FileCheck %s

%"struct.std::array" = type { [8 x i16] }

define { i64, i64 } @compute_min(ptr noundef nonnull align 2 dereferenceable(16) %x, ptr noundef nonnull align 2 dereferenceable(16) %y) {
; SSE2-LABEL: @compute_min(
; SSE2-NEXT: entry:
; SSE2-NEXT: [[LD0:%.*]] = load i16, ptr [[Y:%.*]], align 2
; SSE2-NEXT: [[LD1:%.*]] = load i16, ptr [[X:%.*]], align 2
; SSE2-NEXT: [[LD2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[LD0]], i16 [[LD1]])
; SSE2-NEXT: [[PT1_1:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 2
; SSE2-NEXT: [[PT0_1:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 2
; SSE2-NEXT: [[LD0_1:%.*]] = load i16, ptr [[PT0_1]], align 2
; SSE2-NEXT: [[LD1_1:%.*]] = load i16, ptr [[PT1_1]], align 2
; SSE2-NEXT: [[LD2_1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[LD0_1]], i16 [[LD1_1]])
; SSE2-NEXT: [[PT1_2:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 4
; SSE2-NEXT: [[PT0_2:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 4
; SSE2-NEXT: [[LD0_2:%.*]] = load i16, ptr [[PT0_2]], align 2
; SSE2-NEXT: [[LD1_2:%.*]] = load i16, ptr [[PT1_2]], align 2
; SSE2-NEXT: [[LD2_2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[LD0_2]], i16 [[LD1_2]])
; SSE2-NEXT: [[PT1_3:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 6
; SSE2-NEXT: [[PT0_3:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 6
; SSE2-NEXT: [[LD0_3:%.*]] = load i16, ptr [[PT0_3]], align 2
; SSE2-NEXT: [[LD1_3:%.*]] = load i16, ptr [[PT1_3]], align 2
; SSE2-NEXT: [[LD2_3:%.*]] = tail call i16 @llvm.smin.i16(i16 [[LD0_3]], i16 [[LD1_3]])
; SSE2-NEXT: [[PT1_4:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 8
; SSE2-NEXT: [[PT0_4:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 8
; SSE2-NEXT: [[LD0_4:%.*]] = load i16, ptr [[PT0_4]], align 2
; SSE2-NEXT: [[LD1_4:%.*]] = load i16, ptr [[PT1_4]], align 2
; SSE2-NEXT: [[LD2_4:%.*]] = tail call i16 @llvm.smin.i16(i16 [[LD0_4]], i16 [[LD1_4]])
; SSE2-NEXT: [[PT1_5:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 10
; SSE2-NEXT: [[PT0_5:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 10
; SSE2-NEXT: [[LD0_5:%.*]] = load i16, ptr [[PT0_5]], align 2
; SSE2-NEXT: [[LD1_5:%.*]] = load i16, ptr [[PT1_5]], align 2
; SSE2-NEXT: [[LD2_5:%.*]] = tail call i16 @llvm.smin.i16(i16 [[LD0_5]], i16 [[LD1_5]])
; SSE2-NEXT: [[PT1_6:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 12
; SSE2-NEXT: [[PT0_6:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 12
; SSE2-NEXT: [[LD0_6:%.*]] = load i16, ptr [[PT0_6]], align 2
; SSE2-NEXT: [[LD1_6:%.*]] = load i16, ptr [[PT1_6]], align 2
; SSE2-NEXT: [[LD2_6:%.*]] = tail call i16 @llvm.smin.i16(i16 [[LD0_6]], i16 [[LD1_6]])
; SSE2-NEXT: [[PT1_7:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 14
; SSE2-NEXT: [[PT0_7:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 14
; SSE2-NEXT: [[LD0_7:%.*]] = load i16, ptr [[PT0_7]], align 2
; SSE2-NEXT: [[LD1_7:%.*]] = load i16, ptr [[PT1_7]], align 2
; SSE2-NEXT: [[LD2_7:%.*]] = tail call i16 @llvm.smin.i16(i16 [[LD0_7]], i16 [[LD1_7]])
; SSE2-NEXT: [[RETVAL_SROA_4_0_INSERT_EXT:%.*]] = zext i16 [[LD2_3]] to i64
; SSE2-NEXT: [[RETVAL_SROA_4_0_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_4_0_INSERT_EXT]], 48
; SSE2-NEXT: [[RETVAL_SROA_3_0_INSERT_EXT:%.*]] = zext i16 [[LD2_2]] to i64
; SSE2-NEXT: [[RETVAL_SROA_3_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_3_0_INSERT_EXT]], 32
; SSE2-NEXT: [[RETVAL_SROA_3_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_4_0_INSERT_SHIFT]], [[RETVAL_SROA_3_0_INSERT_SHIFT]]
; SSE2-NEXT: [[RETVAL_SROA_2_0_INSERT_EXT:%.*]] = zext i16 [[LD2_1]] to i64
; SSE2-NEXT: [[RETVAL_SROA_2_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_2_0_INSERT_EXT]], 16
; SSE2-NEXT: [[RETVAL_SROA_2_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_3_0_INSERT_INSERT]], [[RETVAL_SROA_2_0_INSERT_SHIFT]]
; SSE2-NEXT: [[RETVAL_SROA_0_0_INSERT_EXT:%.*]] = zext i16 [[LD2]] to i64
; SSE2-NEXT: [[TMP20:%.*]] = or disjoint i64 [[RETVAL_SROA_2_0_INSERT_INSERT]], [[RETVAL_SROA_0_0_INSERT_EXT]]
; SSE2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP20]], 0
; SSE2-NEXT: [[RETVAL_SROA_9_8_INSERT_EXT:%.*]] = zext i16 [[LD2_7]] to i64
; SSE2-NEXT: [[RETVAL_SROA_9_8_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_9_8_INSERT_EXT]], 48
; SSE2-NEXT: [[RETVAL_SROA_8_8_INSERT_EXT:%.*]] = zext i16 [[LD2_6]] to i64
; SSE2-NEXT: [[RETVAL_SROA_8_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_8_8_INSERT_EXT]], 32
; SSE2-NEXT: [[RETVAL_SROA_8_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_9_8_INSERT_SHIFT]], [[RETVAL_SROA_8_8_INSERT_SHIFT]]
; SSE2-NEXT: [[RETVAL_SROA_7_8_INSERT_EXT:%.*]] = zext i16 [[LD2_5]] to i64
; SSE2-NEXT: [[RETVAL_SROA_7_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_7_8_INSERT_EXT]], 16
; SSE2-NEXT: [[RETVAL_SROA_7_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_8_8_INSERT_INSERT]], [[RETVAL_SROA_7_8_INSERT_SHIFT]]
; SSE2-NEXT: [[RETVAL_SROA_5_8_INSERT_EXT:%.*]] = zext i16 [[LD2_4]] to i64
; SSE2-NEXT: [[TMP21:%.*]] = or disjoint i64 [[RETVAL_SROA_7_8_INSERT_INSERT]], [[RETVAL_SROA_5_8_INSERT_EXT]]
; SSE2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP21]], 1
; SSE2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
;
; SSE4-LABEL: @compute_min(
; SSE4-NEXT: entry:
; SSE4-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[Y:%.*]], align 2
; SSE4-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[X:%.*]], align 2
; SSE4-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <2 x i32> <i32 0, i32 4>
; SSE4-NEXT: [[TMP4:%.*]] = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <2 x i32> <i32 1, i32 5>
; SSE4-NEXT: [[TMP6:%.*]] = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <2 x i32> <i32 3, i32 6>
; SSE4-NEXT: [[TMP8:%.*]] = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
; SSE4-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> poison, <2 x i32> <i32 2, i32 7>
; SSE4-NEXT: [[TMP10:%.*]] = zext <2 x i16> [[TMP9]] to <2 x i64>
; SSE4-NEXT: [[TMP11:%.*]] = shl nuw <2 x i64> [[TMP10]], <i64 32, i64 48>
; SSE4-NEXT: [[TMP12:%.*]] = zext <2 x i16> [[TMP7]] to <2 x i64>
; SSE4-NEXT: [[TMP13:%.*]] = shl nuw <2 x i64> [[TMP12]], <i64 48, i64 32>
; SSE4-NEXT: [[TMP14:%.*]] = or disjoint <2 x i64> [[TMP11]], [[TMP13]]
; SSE4-NEXT: [[TMP15:%.*]] = zext <2 x i16> [[TMP5]] to <2 x i64>
; SSE4-NEXT: [[TMP16:%.*]] = shl nuw nsw <2 x i64> [[TMP15]], splat (i64 16)
; SSE4-NEXT: [[TMP17:%.*]] = or disjoint <2 x i64> [[TMP14]], [[TMP16]]
; SSE4-NEXT: [[TMP18:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i64>
; SSE4-NEXT: [[TMP19:%.*]] = or disjoint <2 x i64> [[TMP17]], [[TMP18]]
; SSE4-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP19]], i64 0
; SSE4-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP20]], 0
; SSE4-NEXT: [[TMP21:%.*]] = extractelement <2 x i64> [[TMP19]], i64 1
; SSE4-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP21]], 1
; SSE4-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
;
; AVX-LABEL: @compute_min(
; AVX-NEXT: entry:
; AVX-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[Y:%.*]], align 2
; AVX-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[X:%.*]], align 2
; AVX-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <2 x i32> <i32 0, i32 4>
; AVX-NEXT: [[TMP4:%.*]] = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <2 x i32> <i32 1, i32 5>
; AVX-NEXT: [[TMP6:%.*]] = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
; AVX-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <2 x i32> <i32 3, i32 6>
; AVX-NEXT: [[TMP8:%.*]] = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
; AVX-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> poison, <2 x i32> <i32 2, i32 7>
; AVX-NEXT: [[TMP10:%.*]] = zext <2 x i16> [[TMP9]] to <2 x i64>
; AVX-NEXT: [[TMP11:%.*]] = shl nuw <2 x i64> [[TMP10]], <i64 32, i64 48>
; AVX-NEXT: [[TMP12:%.*]] = zext <2 x i16> [[TMP7]] to <2 x i64>
; AVX-NEXT: [[TMP13:%.*]] = shl nuw <2 x i64> [[TMP12]], <i64 48, i64 32>
; AVX-NEXT: [[TMP14:%.*]] = or disjoint <2 x i64> [[TMP11]], [[TMP13]]
; AVX-NEXT: [[TMP15:%.*]] = zext <2 x i16> [[TMP5]] to <2 x i64>
; AVX-NEXT: [[TMP16:%.*]] = shl nuw nsw <2 x i64> [[TMP15]], splat (i64 16)
; AVX-NEXT: [[TMP17:%.*]] = or disjoint <2 x i64> [[TMP14]], [[TMP16]]
; AVX-NEXT: [[TMP18:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i64>
; AVX-NEXT: [[TMP19:%.*]] = or disjoint <2 x i64> [[TMP17]], [[TMP18]]
; AVX-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP19]], i64 0
; AVX-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP20]], 0
; AVX-NEXT: [[TMP21:%.*]] = extractelement <2 x i64> [[TMP19]], i64 1
; AVX-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP21]], 1
; AVX-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
; CHECK-LABEL: @compute_min(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[PT1_4:%.*]] = getelementptr inbounds nuw i8, ptr [[X:%.*]], i64 8
; CHECK-NEXT: [[PT0_4:%.*]] = getelementptr inbounds nuw i8, ptr [[Y:%.*]], i64 8
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[Y]], align 2
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[X]], align 2
; CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
; CHECK-NEXT: [[TMP4:%.*]] = shl nuw <4 x i64> [[TMP3]], <i64 0, i64 16, i64 32, i64 48>
; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP4]])
; CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP5]], 0
; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[PT0_4]], align 2
; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i16>, ptr [[PT1_4]], align 2
; CHECK-NEXT: [[TMP8:%.*]] = tail call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP6]], <4 x i16> [[TMP7]])
; CHECK-NEXT: [[TMP9:%.*]] = zext <4 x i16> [[TMP8]] to <4 x i64>
; CHECK-NEXT: [[TMP10:%.*]] = shl nuw <4 x i64> [[TMP9]], <i64 0, i64 16, i64 32, i64 48>
; CHECK-NEXT: [[TMP11:%.*]] = tail call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP10]])
; CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP11]], 1
; CHECK-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
;
entry:
%retval = alloca %"struct.std::array", align 2
Expand Down Expand Up @@ -158,6 +58,3 @@ for.body: ; preds = %for.cond
%inc = add nuw nsw i32 %i.0, 1
br label %for.cond
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}
; SSE: {{.*}}
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,15 @@ define <4 x i32> @square(<4 x i32> %num, i32 %y, i32 %x, i32 %h, i32 %k, i32 %w,
; CHECK-NEXT: [[MUL13:%.*]] = mul nsw i32 [[W:%.*]], 53
; CHECK-NEXT: [[DIV17:%.*]] = sdiv i32 [[X:%.*]], 820
; CHECK-NEXT: [[MUL21:%.*]] = shl nsw i32 [[U:%.*]], 2
; CHECK-NEXT: [[OP_RDX:%.*]] = add nsw i32 [[DIV17]], 317426
; CHECK-NEXT: [[OP_RDX9:%.*]] = add nsw i32 [[DIV]], [[DIV9]]
; CHECK-NEXT: [[OP_RDX10:%.*]] = add i32 [[MUL5]], [[MUL13]]
; CHECK-NEXT: [[OP_RDX11:%.*]] = add i32 [[MUL]], [[MUL21]]
; CHECK-NEXT: [[OP_RDX12:%.*]] = add i32 [[OP_RDX]], [[OP_RDX9]]
; CHECK-NEXT: [[OP_RDX13:%.*]] = add i32 [[OP_RDX10]], [[OP_RDX11]]
; CHECK-NEXT: [[OP_RDX14:%.*]] = add i32 [[OP_RDX12]], [[OP_RDX13]]
; CHECK-NEXT: [[OP_RDX15:%.*]] = add i32 [[OP_RDX14]], [[Y:%.*]]
; CHECK-NEXT: [[DOTSCALAR:%.*]] = add i32 [[Y:%.*]], 1
; CHECK-NEXT: [[DOTSCALAR1:%.*]] = add i32 [[DOTSCALAR]], [[DIV17]]
; CHECK-NEXT: [[DOTSCALAR2:%.*]] = add i32 [[DOTSCALAR1]], [[MUL5]]
; CHECK-NEXT: [[DOTSCALAR3:%.*]] = add i32 [[DOTSCALAR2]], [[DIV]]
; CHECK-NEXT: [[DOTSCALAR4:%.*]] = add i32 [[DOTSCALAR3]], [[MUL13]]
; CHECK-NEXT: [[DOTSCALAR5:%.*]] = add i32 [[DOTSCALAR4]], [[MUL]]
; CHECK-NEXT: [[DOTSCALAR6:%.*]] = add i32 [[DOTSCALAR5]], [[DIV9]]
; CHECK-NEXT: [[DOTSCALAR7:%.*]] = add i32 [[DOTSCALAR6]], [[MUL21]]
; CHECK-NEXT: [[OP_RDX15:%.*]] = add i32 [[DOTSCALAR7]], 317425
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[OP_RDX15]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[ADD29:%.*]] = add <4 x i32> [[TMP2]], [[NUM:%.*]]
Expand Down
17 changes: 9 additions & 8 deletions llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,15 @@ define <4 x i32> @square(<4 x i32> %num, i32 %y, i32 %x, i32 %h, i32 %k, i32 %w,
; CHECK-NEXT: [[MUL13:%.*]] = mul nsw i32 [[W:%.*]], 53
; CHECK-NEXT: [[DIV17:%.*]] = sdiv i32 [[X:%.*]], 820
; CHECK-NEXT: [[MUL21:%.*]] = shl nsw i32 [[U:%.*]], 2
; CHECK-NEXT: [[OP_RDX:%.*]] = add nsw i32 [[DIV17]], 317426
; CHECK-NEXT: [[OP_RDX9:%.*]] = add nsw i32 [[DIV]], [[DIV9]]
; CHECK-NEXT: [[OP_RDX10:%.*]] = add i32 [[MUL5]], [[MUL13]]
; CHECK-NEXT: [[OP_RDX11:%.*]] = add i32 [[MUL]], [[MUL21]]
; CHECK-NEXT: [[OP_RDX12:%.*]] = add i32 [[OP_RDX]], [[OP_RDX9]]
; CHECK-NEXT: [[OP_RDX13:%.*]] = add i32 [[OP_RDX10]], [[OP_RDX11]]
; CHECK-NEXT: [[OP_RDX14:%.*]] = add i32 [[OP_RDX12]], [[OP_RDX13]]
; CHECK-NEXT: [[OP_RDX15:%.*]] = add i32 [[OP_RDX14]], [[Y:%.*]]
; CHECK-NEXT: [[DOTSCALAR:%.*]] = add i32 [[Y:%.*]], 1
; CHECK-NEXT: [[DOTSCALAR1:%.*]] = add i32 [[DOTSCALAR]], [[DIV17]]
; CHECK-NEXT: [[DOTSCALAR2:%.*]] = add i32 [[DOTSCALAR1]], [[MUL5]]
; CHECK-NEXT: [[DOTSCALAR3:%.*]] = add i32 [[DOTSCALAR2]], [[DIV]]
; CHECK-NEXT: [[DOTSCALAR4:%.*]] = add i32 [[DOTSCALAR3]], [[MUL13]]
; CHECK-NEXT: [[DOTSCALAR5:%.*]] = add i32 [[DOTSCALAR4]], [[MUL]]
; CHECK-NEXT: [[DOTSCALAR6:%.*]] = add i32 [[DOTSCALAR5]], [[DIV9]]
; CHECK-NEXT: [[DOTSCALAR7:%.*]] = add i32 [[DOTSCALAR6]], [[MUL21]]
; CHECK-NEXT: [[OP_RDX15:%.*]] = add i32 [[DOTSCALAR7]], 317425
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[OP_RDX15]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[ADD29:%.*]] = add <4 x i32> [[TMP2]], [[NUM:%.*]]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ define void @test() {
; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr null, align 2
; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i16> [[TMP0]], zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> [[TMP2]])
; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.smax.i16(i16 [[TMP1]], i16 [[TMP3]])
; CHECK-NEXT: [[TMP5:%.*]] = call i16 @llvm.smax.i16(i16 [[TMP4]], i16 0)
; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.smax.i16(i16 [[TMP1]], i16 0)
; CHECK-NEXT: [[TMP5:%.*]] = call i16 @llvm.smax.i16(i16 [[TMP4]], i16 [[TMP3]])
; CHECK-NEXT: [[TMP6:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP5]], i16 0)
; CHECK-NEXT: ret void
;
Expand Down
Loading
Loading