diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h index 3b92cbff28de4..b985292ccee40 100644 --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -57,6 +57,9 @@ enum class RecurKind { FindFirstIVSMin, /// FindFirst reduction with select(icmp(),x,y) where one of ///< (x,y) is a decreasing loop induction, and both x and y ///< are integer type, producing a SMin reduction. + FindFirstIVUMin, /// FindFirst reduction with select(icmp(),x,y) where one of + ///< (x,y) is a decreasing loop induction, and both x and y + ///< are integer type, producing a UMin reduction. FindLastIVSMax, ///< FindLast reduction with select(cmp(),x,y) where one of ///< (x,y) is increasing loop induction, and both x and y ///< are integer type, producing a SMax reduction. @@ -265,7 +268,8 @@ class RecurrenceDescriptor { /// Returns true if the recurrence kind is of the form /// select(cmp(),x,y) where one of (x,y) is decreasing loop induction. static bool isFindFirstIVRecurrenceKind(RecurKind Kind) { - return Kind == RecurKind::FindFirstIVSMin; + return Kind == RecurKind::FindFirstIVSMin || + Kind == RecurKind::FindFirstIVUMin; } /// Returns true if the recurrence kind is of the form diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index b275b1064cef2..39f74beca082f 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -51,6 +51,7 @@ bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurKind Kind) { case RecurKind::UMin: case RecurKind::AnyOf: case RecurKind::FindFirstIVSMin: + case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: return true; @@ -663,9 +664,9 @@ RecurrenceDescriptor::isAnyOfPattern(Loop *Loop, PHINode *OrigPhi, // if (src[i] > 3) // r = i; // } -// The reduction value (r) is derived from either the values of an increasing -// induction variable (i) sequence, or from the start value (0). -// The LLVM IR generated for such loops would be as follows: +// The reduction value (r) is derived from either the values of an induction +// variable (i) sequence, or from the start value (0). The LLVM IR generated for +// such loops would be as follows: // for.body: // %r = phi i32 [ %spec.select, %for.body ], [ 0, %entry ] // %i = phi i32 [ %inc, %for.body ], [ 0, %entry ] @@ -674,13 +675,16 @@ RecurrenceDescriptor::isAnyOfPattern(Loop *Loop, PHINode *OrigPhi, // %spec.select = select i1 %cmp, i32 %i, i32 %r // %inc = add nsw i32 %i, 1 // ... -// Since 'i' is an increasing induction variable, the reduction value after the -// loop will be the maximum value of 'i' that the condition (src[i] > 3) is -// satisfied, or the start value (0 in the example above). When the start value -// of the increasing induction variable 'i' is greater than the minimum value of -// the data type, we can use the minimum value of the data type as a sentinel -// value to replace the start value. This allows us to perform a single -// reduction max operation to obtain the final reduction result. +// Since 'i' is an induction variable, the reduction value after the loop will +// be the maximum (increasing induction) or minimum (decreasing induction) value +// of 'i' that the condition (src[i] > 3) is satisfied, or the start value (0 in +// the example above). When the start value of the induction variable 'i' is +// greater than the minimum (increasing induction) or maximum (decreasing +// induction) value of the data type, we can use the minimum (increasing +// induction) or maximum (decreasing induction) value of the data type as a +// sentinel value to replace the start value. This allows us to perform a single +// reduction max (increasing induction) or min (decreasing induction) operation +// to obtain the final reduction result. // TODO: It is possible to solve the case where the start value is the minimum // value of the data type or a non-constant value by using mask and multiple // reduction operations. @@ -695,6 +699,9 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop, if (!OrigPhi->hasOneUse()) return InstDesc(false, I); + // We are looking for selects of the form: + // select(cmp(), phi, loop_induction) or + // select(cmp(), loop_induction, phi) // TODO: Match selects with multi-use cmp conditions. Value *NonRdxPhi = nullptr; if (!match(I, m_CombineOr(m_Select(m_OneUse(m_Cmp()), m_Value(NonRdxPhi), @@ -703,8 +710,8 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop, m_Value(NonRdxPhi))))) return InstDesc(false, I); - // Returns a non-nullopt boolean indicating the signedness of the recurrence - // when a valid FindLastIV pattern is found. + // Returns either FindFirstIV/FindLastIV, if such a pattern is found, or + // std::nullopt. auto GetRecurKind = [&](Value *V) -> std::optional { Type *Ty = V->getType(); if (!SE.isSCEVable(Ty)) @@ -719,16 +726,12 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop, (isFindLastIVRecurrenceKind(Kind) && !SE.isKnownPositive(Step))) return std::nullopt; - // Keep the minimum value of the recurrence type as the sentinel value. - // The maximum acceptable range for the increasing induction variable, - // called the valid range, will be defined as - - // Keep the minimum (FindLast) or maximum (FindFirst) value of the - // recurrence type as the sentinel value. The maximum acceptable range for - // the induction variable, called the valid range, will be defined as - // [ + 1, ) - // where is [Signed|Unsigned]Min() for - // FindLastIV or [Signed|Unsigned]Max() for FindFirstIV. + // Check if the minimum (FindLast) or maximum (FindFirst) value of the + // recurrence type can be used as a sentinel value. The maximum acceptable + // range for the induction variable, called the valid range will exclude + // , where is + // [Signed|Unsigned]Min() for FindLastIV or + // [Signed|Unsigned]Max() for FindFirstIV. // TODO: This range restriction can be lifted by adding an additional // virtual OR reduction. auto CheckRange = [&](bool IsSigned) { @@ -741,10 +744,13 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop, : APInt::getMinValue(NumBits); ValidRange = ConstantRange::getNonEmpty(Sentinel + 1, Sentinel); } else { - assert(IsSigned && "Only FindFirstIV with SMax is supported currently"); - ValidRange = - ConstantRange::getNonEmpty(APInt::getSignedMinValue(NumBits), - APInt::getSignedMaxValue(NumBits) - 1); + if (IsSigned) + ValidRange = + ConstantRange::getNonEmpty(APInt::getSignedMinValue(NumBits), + APInt::getSignedMaxValue(NumBits) - 1); + else + ValidRange = ConstantRange::getNonEmpty( + APInt::getMinValue(NumBits), APInt::getMaxValue(NumBits) - 1); } LLVM_DEBUG(dbgs() << "LV: " @@ -770,13 +776,11 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop, if (CheckRange(true)) return RecurKind::FindFirstIVSMin; + if (CheckRange(false)) + return RecurKind::FindFirstIVUMin; return std::nullopt; }; - // We are looking for selects of the form: - // select(cmp(), phi, increasing_loop_induction) or - // select(cmp(), increasing_loop_induction, phi) - // TODO: Support for monotonically decreasing induction variable if (auto RK = GetRecurKind(NonRdxPhi)) return InstDesc(I, *RK); @@ -1183,6 +1187,7 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) { return Instruction::Mul; case RecurKind::AnyOf: case RecurKind::FindFirstIVSMin: + case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: case RecurKind::Or: diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 11b2aac0005c9..26d48fdcceabd 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -23185,6 +23185,7 @@ class HorizontalReduction { case RecurKind::FMulAdd: case RecurKind::AnyOf: case RecurKind::FindFirstIVSMin: + case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: case RecurKind::FMaximumNum: @@ -23321,6 +23322,7 @@ class HorizontalReduction { case RecurKind::FMulAdd: case RecurKind::AnyOf: case RecurKind::FindFirstIVSMin: + case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: case RecurKind::FMaximumNum: @@ -23422,6 +23424,7 @@ class HorizontalReduction { case RecurKind::FMulAdd: case RecurKind::AnyOf: case RecurKind::FindFirstIVSMin: + case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: case RecurKind::FMaximumNum: diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index ccb7512051d77..75ade13b09d9c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -763,14 +763,10 @@ Value *VPInstruction::generate(VPTransformState &State) { Value *ReducedPartRdx = State.get(getOperand(3)); RecurKind MinMaxKind; bool IsSigned = RecurrenceDescriptor::isSignedRecurrenceKind(RK); - if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) { + if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) MinMaxKind = IsSigned ? RecurKind::SMax : RecurKind::UMax; - } else { - assert(RecurrenceDescriptor::isFindFirstIVRecurrenceKind(RK) && - "Kind must either be FindLastIV or FindFirstIV"); - assert(IsSigned && "Only FindFirstIV with SMax is currently supported"); - MinMaxKind = RecurKind::SMin; - } + else + MinMaxKind = IsSigned ? RecurKind::SMin : RecurKind::UMin; for (unsigned Part = 1; Part < UF; ++Part) ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, State.get(getOperand(3 + Part))); diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll index d224da795997d..eb9c1cd2848df 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll @@ -941,6 +941,250 @@ exit: ; preds = %loop ret i16 %spec.select.lcssa } +; The signed sentinel value for decreasing-IV vectorization is LONG_MAX, and since +; the IV hits this value with smin vectorization, it needs to be vectorized with a +; an unsigned sentinel and umin instead. +define i64 @select_decreasing_induction_icmp_iv_unsigned(ptr %a) { +; IC1VF4-LABEL: define i64 @select_decreasing_induction_icmp_iv_unsigned( +; IC1VF4-SAME: ptr [[A:%.*]]) { +; IC1VF4-NEXT: [[ENTRY:.*]]: +; IC1VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC1VF4: [[VECTOR_PH]]: +; IC1VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1VF4: [[VECTOR_BODY]]: +; IC1VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[OFFSET_IDX:%.*]] = sub i64 9223372036854775807, [[INDEX]] +; IC1VF4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]] +; IC1VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 0 +; IC1VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 -3 +; IC1VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; IC1VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> +; IC1VF4-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[REVERSE]], splat (i64 3) +; IC1VF4-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; IC1VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; IC1VF4-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], -9223372036854775808 +; IC1VF4-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IC1VF4: [[MIDDLE_BLOCK]]: +; IC1VF4-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> [[TMP4]]) +; IC1VF4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP6]], -1 +; IC1VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP6]], i64 331 +; IC1VF4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC1VF4: [[SCALAR_PH]]: +; IC1VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, %[[MIDDLE_BLOCK]] ], [ 9223372036854775807, %[[ENTRY]] ] +; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ] +; IC1VF4-NEXT: br label %[[LOOP:.*]] +; IC1VF4: [[LOOP]]: +; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; IC1VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] +; IC1VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; IC1VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 +; IC1VF4-NEXT: [[CMP_A_3:%.*]] = icmp sgt i64 [[LD_A]], 3 +; IC1VF4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP_A_3]], i64 [[IV]], i64 [[RDX]] +; IC1VF4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; IC1VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV]], 0 +; IC1VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; IC1VF4: [[EXIT]]: +; IC1VF4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; IC1VF4-NEXT: ret i64 [[SPEC_SELECT_LCSSA]] +; +; IC4VF4-LABEL: define i64 @select_decreasing_induction_icmp_iv_unsigned( +; IC4VF4-SAME: ptr [[A:%.*]]) { +; IC4VF4-NEXT: [[ENTRY:.*]]: +; IC4VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC4VF4: [[VECTOR_PH]]: +; IC4VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; IC4VF4: [[VECTOR_BODY]]: +; IC4VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; IC4VF4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 -4) +; IC4VF4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 -4) +; IC4VF4-NEXT: [[OFFSET_IDX:%.*]] = sub i64 9223372036854775807, [[INDEX]] +; IC4VF4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]] +; IC4VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 0 +; IC4VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 -3 +; IC4VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 -4 +; IC4VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -3 +; IC4VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 -8 +; IC4VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 -3 +; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 -12 +; IC4VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 -3 +; IC4VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; IC4VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> +; IC4VF4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; IC4VF4-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD4]], <4 x i64> poison, <4 x i32> +; IC4VF4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP6]], align 8 +; IC4VF4-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD6]], <4 x i64> poison, <4 x i32> +; IC4VF4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8 +; IC4VF4-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD8]], <4 x i64> poison, <4 x i32> +; IC4VF4-NEXT: [[TMP9:%.*]] = icmp sgt <4 x i64> [[REVERSE]], splat (i64 3) +; IC4VF4-NEXT: [[TMP10:%.*]] = icmp sgt <4 x i64> [[REVERSE5]], splat (i64 3) +; IC4VF4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[REVERSE7]], splat (i64 3) +; IC4VF4-NEXT: [[TMP12:%.*]] = icmp sgt <4 x i64> [[REVERSE9]], splat (i64 3) +; IC4VF4-NEXT: [[TMP13]] = select <4 x i1> [[TMP9]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; IC4VF4-NEXT: [[TMP14]] = select <4 x i1> [[TMP10]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]] +; IC4VF4-NEXT: [[TMP15]] = select <4 x i1> [[TMP11]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]] +; IC4VF4-NEXT: [[TMP16]] = select <4 x i1> [[TMP12]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]] +; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 -4) +; IC4VF4-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], -9223372036854775808 +; IC4VF4-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IC4VF4: [[MIDDLE_BLOCK]]: +; IC4VF4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[TMP13]], <4 x i64> [[TMP14]]) +; IC4VF4-NEXT: [[RDX_MINMAX10:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP15]]) +; IC4VF4-NEXT: [[RDX_MINMAX11:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[RDX_MINMAX10]], <4 x i64> [[TMP16]]) +; IC4VF4-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> [[RDX_MINMAX11]]) +; IC4VF4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP18]], -1 +; IC4VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP18]], i64 331 +; IC4VF4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC4VF4: [[SCALAR_PH]]: +; IC4VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, %[[MIDDLE_BLOCK]] ], [ 9223372036854775807, %[[ENTRY]] ] +; IC4VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ] +; IC4VF4-NEXT: br label %[[LOOP:.*]] +; IC4VF4: [[LOOP]]: +; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; IC4VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] +; IC4VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; IC4VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 +; IC4VF4-NEXT: [[CMP_A_3:%.*]] = icmp sgt i64 [[LD_A]], 3 +; IC4VF4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP_A_3]], i64 [[IV]], i64 [[RDX]] +; IC4VF4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; IC4VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV]], 0 +; IC4VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; IC4VF4: [[EXIT]]: +; IC4VF4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; IC4VF4-NEXT: ret i64 [[SPEC_SELECT_LCSSA]] +; +; IC4VF1-LABEL: define i64 @select_decreasing_induction_icmp_iv_unsigned( +; IC4VF1-SAME: ptr [[A:%.*]]) { +; IC4VF1-NEXT: [[ENTRY:.*]]: +; IC4VF1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC4VF1: [[VECTOR_PH]]: +; IC4VF1-NEXT: br label %[[VECTOR_BODY:.*]] +; IC4VF1: [[VECTOR_BODY]]: +; IC4VF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC4VF1-NEXT: [[VEC_PHI:%.*]] = phi i64 [ -1, %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; IC4VF1-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ -1, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ] +; IC4VF1-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -1, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ] +; IC4VF1-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ -1, %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] +; IC4VF1-NEXT: [[OFFSET_IDX:%.*]] = sub i64 9223372036854775807, [[INDEX]] +; IC4VF1-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], -1 +; IC4VF1-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -2 +; IC4VF1-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -3 +; IC4VF1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]] +; IC4VF1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] +; IC4VF1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; IC4VF1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; IC4VF1-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP3]], align 8 +; IC4VF1-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP4]], align 8 +; IC4VF1-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 8 +; IC4VF1-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8 +; IC4VF1-NEXT: [[TMP11:%.*]] = icmp sgt i64 [[TMP7]], 3 +; IC4VF1-NEXT: [[TMP12:%.*]] = icmp sgt i64 [[TMP8]], 3 +; IC4VF1-NEXT: [[TMP13:%.*]] = icmp sgt i64 [[TMP9]], 3 +; IC4VF1-NEXT: [[TMP14:%.*]] = icmp sgt i64 [[TMP10]], 3 +; IC4VF1-NEXT: [[TMP15]] = select i1 [[TMP11]], i64 [[OFFSET_IDX]], i64 [[VEC_PHI]] +; IC4VF1-NEXT: [[TMP16]] = select i1 [[TMP12]], i64 [[TMP0]], i64 [[VEC_PHI1]] +; IC4VF1-NEXT: [[TMP17]] = select i1 [[TMP13]], i64 [[TMP1]], i64 [[VEC_PHI2]] +; IC4VF1-NEXT: [[TMP18]] = select i1 [[TMP14]], i64 [[TMP2]], i64 [[VEC_PHI3]] +; IC4VF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC4VF1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], -9223372036854775808 +; IC4VF1-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IC4VF1: [[MIDDLE_BLOCK]]: +; IC4VF1-NEXT: [[RDX_MINMAX:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP15]], i64 [[TMP16]]) +; IC4VF1-NEXT: [[RDX_MINMAX4:%.*]] = call i64 @llvm.umin.i64(i64 [[RDX_MINMAX]], i64 [[TMP17]]) +; IC4VF1-NEXT: [[RDX_MINMAX5:%.*]] = call i64 @llvm.umin.i64(i64 [[RDX_MINMAX4]], i64 [[TMP18]]) +; IC4VF1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[RDX_MINMAX5]], -1 +; IC4VF1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX5]], i64 331 +; IC4VF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC4VF1: [[SCALAR_PH]]: +; IC4VF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, %[[MIDDLE_BLOCK]] ], [ 9223372036854775807, %[[ENTRY]] ] +; IC4VF1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ] +; IC4VF1-NEXT: br label %[[LOOP:.*]] +; IC4VF1: [[LOOP]]: +; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] +; IC4VF1-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; IC4VF1-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 +; IC4VF1-NEXT: [[CMP_A_3:%.*]] = icmp sgt i64 [[LD_A]], 3 +; IC4VF1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP_A_3]], i64 [[IV]], i64 [[RDX]] +; IC4VF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; IC4VF1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV]], 0 +; IC4VF1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; IC4VF1: [[EXIT]]: +; IC4VF1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; IC4VF1-NEXT: ret i64 [[SPEC_SELECT_LCSSA]] +; +entry: + br label %loop + +loop: ; preds = %entry, %loop + %iv = phi i64 [ 9223372036854775807, %entry ], [ %iv.next, %loop ] + %rdx = phi i64 [ 331, %entry ], [ %spec.select, %loop ] + %gep.a.iv = getelementptr inbounds i64, ptr %a, i64 %iv + %ld.a = load i64, ptr %gep.a.iv, align 8 + %cmp.a.3 = icmp sgt i64 %ld.a, 3 + %spec.select = select i1 %cmp.a.3, i64 %iv, i64 %rdx + %iv.next = add nsw i64 %iv, -1 + %exit.cond = icmp eq i64 %iv, 0 + br i1 %exit.cond, label %exit, label %loop + +exit: ; preds = %loop + ret i64 %spec.select +} + +; The unsigned sentinel value for decreasing-IV vectorization is ULONG_MAX, +; and since the IV hits this value, it is impossible to vectorize this case. +; In this test, %iv's range will include both signed and unsigned +; maximum (sentinel) values. +define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.start) { +; CHECK-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1 +; CHECK-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1 +; CHECK-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1 +; CHECK-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]] +; CHECK-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] +; CHECK-NEXT: ret i64 [[COND_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ -1, %entry ], [ %iv.next, %loop ] + %rdx = phi i64 [ %rdx.start, %entry ], [ %cond, %loop ] + %iv.next = add i64 %iv, -1 + %gep.a.iv = getelementptr inbounds i8, ptr %a, i64 %iv.next + %ld.a = load i8, ptr %gep.a.iv, align 1 + %gep.b.iv = getelementptr inbounds i8, ptr %b, i64 %iv.next + %ld.b = load i8, ptr %gep.b.iv, align 1 + %cmp.a.b = icmp sgt i8 %ld.a, %ld.b + %cond = select i1 %cmp.a.b, i64 %iv.next, i64 %rdx + %exit.cond = icmp eq i64 %iv.next, 0 + br i1 %exit.cond, label %exit, label %loop + +exit: + ret i64 %cond +} + define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64 %rdx.start, i64 %n) { ; CHECK-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { @@ -981,42 +1225,3 @@ loop: ; preds = %entry, %loop exit: ; preds = %loop ret i64 %cond } - -; The sentinel value for decreasing-IV vectorization is LONG_MAX, and since -; the IV hits this value, it is impossible to vectorize this case. -define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound(ptr %a) { -; CHECK-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound( -; CHECK-SAME: ptr [[A:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 9223372036854775807, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 -; CHECK-NEXT: [[CMP_A_3:%.*]] = icmp sgt i64 [[LD_A]], 3 -; CHECK-NEXT: [[SPEC_SELECT]] = select i1 [[CMP_A_3]], i64 [[IV]], i64 [[RDX]] -; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 -; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV]], 0 -; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[LOOP]] ] -; CHECK-NEXT: ret i64 [[SPEC_SELECT_LCSSA]] -; -entry: - br label %loop - -loop: ; preds = %entry, %loop - %iv = phi i64 [ 9223372036854775807, %entry ], [ %iv.next, %loop ] - %rdx = phi i64 [ 331, %entry ], [ %spec.select, %loop ] - %gep.a.iv = getelementptr inbounds i64, ptr %a, i64 %iv - %ld.a = load i64, ptr %gep.a.iv, align 8 - %cmp.a.3 = icmp sgt i64 %ld.a, 3 - %spec.select = select i1 %cmp.a.3, i64 %iv, i64 %rdx - %iv.next = add nsw i64 %iv, -1 - %exit.cond = icmp eq i64 %iv, 0 - br i1 %exit.cond, label %exit, label %loop - -exit: ; preds = %loop - ret i64 %spec.select -}