From 938a0e31a231ec4715821a3ba5d1dcfc83723533 Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Thu, 7 Aug 2025 13:53:14 +0000 Subject: [PATCH 1/6] [LV] Ignore user-specified interleave count when unsafe. When a VF is specified via a loop hint, it will be clamped to a safe VF or ignored if it is found to be unsafe. This is not the case for user-specified interleave counts, which can lead to loops such as the following with a memory dependence being vectorised with the specified IC: #pragma clang loop interleave_count(4) for (int i = 4; i < LEN; i++) b[i] = b[i - 4] + a[i]; According to [1], loop hints are ignored if they are not safe to apply. This patch adds a check to prevent vectorisation with interleaving if isSafeForAnyVectorWidth() returns false. This is already checked in selectInterleaveCount(). [1] https://llvm.org/docs/LangRef.html#llvm-loop-vectorize-and-llvm-loop-interleave --- .../Transforms/Vectorize/LoopVectorize.cpp | 22 +++++++++---- .../AArch64/scalable-reductions.ll | 13 +++----- .../LoopVectorize/unsafe-ic-hint-remark.ll | 33 +++++++++++++++++++ 3 files changed, 53 insertions(+), 15 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0b7963b98e7a4..38fecec6766c8 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9844,8 +9844,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { ElementCount UserVF = Hints.getWidth(); unsigned UserIC = Hints.getInterleave(); + unsigned SafeUserIC = CM.Legal->isSafeForAnyVectorWidth() ? UserIC : 0; + // Plan how to best vectorize. - LVP.plan(UserVF, UserIC); + LVP.plan(UserVF, SafeUserIC); VectorizationFactor VF = LVP.computeBestVF(); unsigned IC = 1; @@ -9857,7 +9859,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Select the interleave count. 
IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost); - unsigned SelectedIC = std::max(IC, UserIC); + unsigned SelectedIC = std::max(IC, SafeUserIC); + // Optimistically generate runtime checks if they are needed. Drop them if // they turn out to not be profitable. if (VF.Width.isVector() || SelectedIC > 1) { @@ -9907,7 +9910,14 @@ bool LoopVectorizePass::processLoop(Loop *L) { VectorizeLoop = false; } - if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) { + if (UserIC > 0 && UserIC != SafeUserIC) { + LLVM_DEBUG(dbgs() << "LV: Disabling interleaving as user-specified " + "interleave count is unsafe.\n"); + IntDiagMsg = {"InterleavingUnsafe", + "User-specified interleave count is not safe, interleave " + "count is set to 1."}; + InterleaveLoop = false; + } else if (!LVP.hasPlanWithVF(VF.Width) && SafeUserIC > 1) { // Tell the user interleaving was avoided up-front, despite being explicitly // requested. LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " @@ -9915,7 +9925,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { IntDiagMsg = {"InterleavingAvoided", "Ignoring UserIC, because interleaving was avoided up front"}; InterleaveLoop = false; - } else if (IC == 1 && UserIC <= 1) { + } else if (IC == 1 && SafeUserIC <= 1) { // Tell the user interleaving is not beneficial. LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); IntDiagMsg = { @@ -9927,7 +9937,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { IntDiagMsg.second += " and is explicitly disabled or interleave count is set to 1"; } - } else if (IC > 1 && UserIC == 1) { + } else if (IC > 1 && SafeUserIC == 1) { // Tell the user interleaving is beneficial, but it explicitly disabled. LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly " "disabled.\n"); @@ -9951,7 +9961,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { } // Override IC if user provided an interleave count. - IC = UserIC > 0 ? UserIC : IC; + IC = SafeUserIC > 0 ? 
SafeUserIC : IC; // Emit diagnostic messages, if any. const char *VAPassName = Hints.vectorizeAnalysisPassName(); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll index 11cc971586773..f1fc78f117fba 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll @@ -417,21 +417,16 @@ for.end: ; preds = %for.body, %entry ; Note: This test was added to ensure we always check the legality of reductions (end emit a warning if necessary) before checking for memory dependencies ; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. -; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2) +; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 1) define i32 @memory_dependence(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) { ; CHECK-LABEL: @memory_dependence ; CHECK: vector.body: ; CHECK: %[[LOAD1:.*]] = load <4 x i32> ; CHECK: %[[LOAD2:.*]] = load <4 x i32> -; CHECK: %[[LOAD3:.*]] = load <4 x i32> -; CHECK: %[[LOAD4:.*]] = load <4 x i32> -; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]] -; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]] -; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]] -; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]] +; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD2]], %[[LOAD1]] +; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD2]] ; CHECK: middle.block: -; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]] -; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]]) +; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[MUL1]]) entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll b/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll new file mode 100644 index 
0000000000000..034df3f54e7e5 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll @@ -0,0 +1,33 @@ +; REQUIRES: asserts +; RUN: opt -passes=loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s + +; Make sure the unsafe user specified interleave count is ignored. + +; CHECK: LV: Disabling interleaving as user-specified interleave count is unsafe. +; CHECK: remark: :0:0: User-specified interleave count is not safe, interleave count is set to 1. +; CHECK-LABEL: @loop_distance_4 +define void @loop_distance_4(i64 %N, ptr %a, ptr %b) { +entry: + %cmp10 = icmp sgt i64 %N, 4 + br i1 %cmp10, label %for.body, label %for.end + +for.body: + %indvars.iv = phi i64 [ 4, %entry ], [ %indvars.iv.next, %for.body ] + %0 = getelementptr i32, ptr %b, i64 %indvars.iv + %arrayidx = getelementptr i8, ptr %0, i64 -16 + %1 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds nuw i32, ptr %a, i64 %indvars.iv + %2 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %2, %1 + store i32 %add, ptr %0, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %N + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1 + +for.end: + ret void +} + +!1 = !{!1, !2, !3} +!2 = !{!"llvm.loop.interleave.count", i32 4} +!3 = !{!"llvm.loop.vectorize.width", i32 4} From 31c6578bfd76cc6cf69ecd02489543ea17819f55 Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Mon, 11 Aug 2025 14:54:13 +0000 Subject: [PATCH 2/6] - Reworded diagnostic message - Removed need for asserts in new test --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 7 +++---- .../LoopVectorize/AArch64/scalable-reductions.ll | 1 + .../test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll | 6 ++---- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 
38fecec6766c8..7e8c50852ff96 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9911,11 +9911,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { } if (UserIC > 0 && UserIC != SafeUserIC) { - LLVM_DEBUG(dbgs() << "LV: Disabling interleaving as user-specified " - "interleave count is unsafe.\n"); + LLVM_DEBUG(dbgs() << "LV: Ignoring user-specified interleave count.\n"); IntDiagMsg = {"InterleavingUnsafe", - "User-specified interleave count is not safe, interleave " - "count is set to 1."}; + "Ignoring user-specified interleave count due to possibly " + "unsafe dependencies in the loop."}; InterleaveLoop = false; } else if (!LVP.hasPlanWithVF(VF.Width) && SafeUserIC > 1) { // Tell the user interleaving was avoided up-front, despite being explicitly diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll index f1fc78f117fba..fb7890a3b82f4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll @@ -417,6 +417,7 @@ for.end: ; preds = %for.body, %entry ; Note: This test was added to ensure we always check the legality of reductions (end emit a warning if necessary) before checking for memory dependencies ; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. +; CHECK-REMARK: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop. 
; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 1) define i32 @memory_dependence(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) { ; CHECK-LABEL: @memory_dependence diff --git a/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll b/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll index 034df3f54e7e5..f2fb7a240bc9e 100644 --- a/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll +++ b/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll @@ -1,10 +1,8 @@ -; REQUIRES: asserts -; RUN: opt -passes=loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s +; RUN: opt -passes=loop-vectorize -pass-remarks-analysis=loop-vectorize -S < %s 2>&1 | FileCheck %s ; Make sure the unsafe user specified interleave count is ignored. -; CHECK: LV: Disabling interleaving as user-specified interleave count is unsafe. -; CHECK: remark: :0:0: User-specified interleave count is not safe, interleave count is set to 1. +; CHECK: remark: :0:0: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop. 
; CHECK-LABEL: @loop_distance_4 define void @loop_distance_4(i64 %N, ptr %a, ptr %b) { entry: From 20fe702e2e84da337152139581ba5bf061b36751 Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Tue, 12 Aug 2025 10:01:23 +0000 Subject: [PATCH 3/6] - Handle UserIC as part of selectInterleaveCount --- .../Vectorize/LoopVectorizationPlanner.h | 4 +- .../Transforms/Vectorize/LoopVectorize.cpp | 72 ++++++++++--------- 2 files changed, 40 insertions(+), 36 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 456fa4c858535..ddf8b1054bf49 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -517,8 +517,8 @@ class LoopVectorizationPlanner { /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. - unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF, - InstructionCost LoopCost); + unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF, unsigned UserIC, + InstructionCost LoopCost, bool &IntBeneficial); /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan /// according to the best selected \p VF and \p UF. 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 7e8c50852ff96..3a9cbfca91fca 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4501,9 +4501,9 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() { } } -unsigned -LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, - InstructionCost LoopCost) { +unsigned LoopVectorizationPlanner::selectInterleaveCount( + VPlan &Plan, ElementCount VF, unsigned UserIC, InstructionCost LoopCost, + bool &IntBeneficial) { // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. // There are many micro-architectural considerations that we can't predict @@ -4518,25 +4518,26 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, // 3. We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. - if (!CM.isScalarEpilogueAllowed()) + // We used the distance for the interleave count. This should not be overriden + // by a user-specified IC. + if (!Legal->isSafeForAnyVectorWidth()) return 1; + if (!CM.isScalarEpilogueAllowed()) + return std::max(1U, UserIC); + if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(), IsaPred)) { LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. " "Unroll factor forced to be 1.\n"); - return 1; + return std::max(1U, UserIC); } - // We used the distance for the interleave count. - if (!Legal->isSafeForAnyVectorWidth()) - return 1; - // We don't attempt to perform interleaving for loops with uncountable early // exits because the VPInstruction::AnyOf code cannot currently handle // multiple parts. 
if (Plan.hasEarlyExit()) - return 1; + return std::max(1U, UserIC); const bool HasReductions = any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(), @@ -4553,7 +4554,7 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, // Loop body is free and there is no need for interleaving. if (LoopCost == 0) - return 1; + return std::max(1U, UserIC); } VPRegisterUsage R = @@ -4690,7 +4691,8 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, // benefit from interleaving. if (VF.isVector() && HasReductions) { LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); - return IC; + IntBeneficial = IC > 1; + return UserIC > 0 ? UserIC : IC; } // For any scalar loop that either requires runtime checks or predication we @@ -4773,7 +4775,7 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, }); if (HasSelectCmpReductions) { LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); - return 1; + return std::max(1U, UserIC); } // If we have a scalar reduction (vector reductions are already dealt with @@ -4792,7 +4794,7 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, if (HasOrderedReductions) { LLVM_DEBUG( dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); - return 1; + return std::max(1U, UserIC); } unsigned F = MaxNestedScalarReductionIC; @@ -4805,7 +4807,9 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, std::max(StoresIC, LoadsIC) > SmallIC) { LLVM_DEBUG( dbgs() << "LV: Interleaving to saturate store or load ports.\n"); - return std::max(StoresIC, LoadsIC); + IC = std::max(StoresIC, LoadsIC); + IntBeneficial = IC > 1; + return UserIC > 0 ? 
UserIC : IC; } // If there are scalar reductions and TTI has enabled aggressive @@ -4814,22 +4818,27 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); // Interleave no less than SmallIC but not as aggressive as the normal IC // to satisfy the rare situation when resources are too limited. - return std::max(IC / 2, SmallIC); + IC = std::max(IC / 2, SmallIC); + IntBeneficial = IC > 1; + return UserIC > 0 ? UserIC : IC; } LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); - return SmallIC; + IC = std::max(SmallIC, UserIC); + IntBeneficial = IC > 1; + return UserIC > 0 ? UserIC : IC; } // Interleave if this is a large loop (small loops are already dealt with by // this point) that could benefit from interleaving. if (AggressivelyInterleaveReductions) { LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); - return IC; + IntBeneficial = IC > 1; + return UserIC > 0 ? UserIC : IC; } LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); - return 1; + return std::max(1U, UserIC); } bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, @@ -9844,10 +9853,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { ElementCount UserVF = Hints.getWidth(); unsigned UserIC = Hints.getInterleave(); - unsigned SafeUserIC = CM.Legal->isSafeForAnyVectorWidth() ? UserIC : 0; - // Plan how to best vectorize. - LVP.plan(UserVF, SafeUserIC); + LVP.plan(UserVF, UserIC); VectorizationFactor VF = LVP.computeBestVF(); unsigned IC = 1; @@ -9855,16 +9862,16 @@ bool LoopVectorizePass::processLoop(Loop *L) { LVP.emitInvalidCostRemarks(ORE); GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind); + bool IntBeneficial = false; if (LVP.hasPlanWithVF(VF.Width)) { // Select the interleave count. 
- IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost); - - unsigned SelectedIC = std::max(IC, SafeUserIC); + IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, UserIC, + VF.Cost, IntBeneficial); // Optimistically generate runtime checks if they are needed. Drop them if // they turn out to not be profitable. - if (VF.Width.isVector() || SelectedIC > 1) { - Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); + if (VF.Width.isVector() || IC > 1) { + Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, IC); // Bail out early if either the SCEV or memory runtime checks are known to // fail. In that case, the vector loop would never execute. @@ -9910,13 +9917,13 @@ bool LoopVectorizePass::processLoop(Loop *L) { VectorizeLoop = false; } - if (UserIC > 0 && UserIC != SafeUserIC) { + if (IC == 1 && UserIC > 1) { LLVM_DEBUG(dbgs() << "LV: Ignoring user-specified interleave count.\n"); IntDiagMsg = {"InterleavingUnsafe", "Ignoring user-specified interleave count due to possibly " "unsafe dependencies in the loop."}; InterleaveLoop = false; - } else if (!LVP.hasPlanWithVF(VF.Width) && SafeUserIC > 1) { + } else if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) { // Tell the user interleaving was avoided up-front, despite being explicitly // requested. LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " @@ -9924,7 +9931,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { IntDiagMsg = {"InterleavingAvoided", "Ignoring UserIC, because interleaving was avoided up front"}; InterleaveLoop = false; - } else if (IC == 1 && SafeUserIC <= 1) { + } else if (!IntBeneficial && UserIC <= 1) { // Tell the user interleaving is not beneficial. 
LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); IntDiagMsg = { @@ -9936,7 +9943,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { IntDiagMsg.second += " and is explicitly disabled or interleave count is set to 1"; } - } else if (IC > 1 && SafeUserIC == 1) { + } else if (IntBeneficial && UserIC == 1) { // Tell the user interleaving is beneficial, but it explicitly disabled. LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly " "disabled.\n"); @@ -9959,9 +9966,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { InterleaveLoop = false; } - // Override IC if user provided an interleave count. - IC = SafeUserIC > 0 ? SafeUserIC : IC; - // Emit diagnostic messages, if any. const char *VAPassName = Hints.vectorizeAnalysisPassName(); if (!VectorizeLoop && !InterleaveLoop) { From 17eda9450a6c5d61fcf93a696893d4f40edf1c1a Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Thu, 9 Oct 2025 10:20:29 +0000 Subject: [PATCH 4/6] - Moved UserIC back out of selectInterleaveCount --- .../Vectorize/LoopVectorizationPlanner.h | 4 +- .../Transforms/Vectorize/LoopVectorize.cpp | 67 +++++++++---------- 2 files changed, 32 insertions(+), 39 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index ddf8b1054bf49..456fa4c858535 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -517,8 +517,8 @@ class LoopVectorizationPlanner { /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. 
- unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF, unsigned UserIC, - InstructionCost LoopCost, bool &IntBeneficial); + unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF, + InstructionCost LoopCost); /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan /// according to the best selected \p VF and \p UF. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 3a9cbfca91fca..1bf6529fc4011 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4501,9 +4501,9 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() { } } -unsigned LoopVectorizationPlanner::selectInterleaveCount( - VPlan &Plan, ElementCount VF, unsigned UserIC, InstructionCost LoopCost, - bool &IntBeneficial) { +unsigned +LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, + InstructionCost LoopCost) { // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. // There are many micro-architectural considerations that we can't predict @@ -4518,26 +4518,25 @@ unsigned LoopVectorizationPlanner::selectInterleaveCount( // 3. We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. - // We used the distance for the interleave count. This should not be overriden - // by a user-specified IC. - if (!Legal->isSafeForAnyVectorWidth()) - return 1; - if (!CM.isScalarEpilogueAllowed()) - return std::max(1U, UserIC); + return 1; if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(), IsaPred)) { LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. " "Unroll factor forced to be 1.\n"); - return std::max(1U, UserIC); + return 1; } + // We used the distance for the interleave count. 
+ if (!Legal->isSafeForAnyVectorWidth()) + return 1; + // We don't attempt to perform interleaving for loops with uncountable early // exits because the VPInstruction::AnyOf code cannot currently handle // multiple parts. if (Plan.hasEarlyExit()) - return std::max(1U, UserIC); + return 1; const bool HasReductions = any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(), @@ -4554,7 +4553,7 @@ unsigned LoopVectorizationPlanner::selectInterleaveCount( // Loop body is free and there is no need for interleaving. if (LoopCost == 0) - return std::max(1U, UserIC); + return 1; } VPRegisterUsage R = @@ -4691,8 +4690,7 @@ unsigned LoopVectorizationPlanner::selectInterleaveCount( // benefit from interleaving. if (VF.isVector() && HasReductions) { LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); - IntBeneficial = IC > 1; - return UserIC > 0 ? UserIC : IC; + return IC; } // For any scalar loop that either requires runtime checks or predication we @@ -4775,7 +4773,7 @@ unsigned LoopVectorizationPlanner::selectInterleaveCount( }); if (HasSelectCmpReductions) { LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); - return std::max(1U, UserIC); + return 1; } // If we have a scalar reduction (vector reductions are already dealt with @@ -4794,7 +4792,7 @@ unsigned LoopVectorizationPlanner::selectInterleaveCount( if (HasOrderedReductions) { LLVM_DEBUG( dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); - return std::max(1U, UserIC); + return 1; } unsigned F = MaxNestedScalarReductionIC; @@ -4807,9 +4805,7 @@ unsigned LoopVectorizationPlanner::selectInterleaveCount( std::max(StoresIC, LoadsIC) > SmallIC) { LLVM_DEBUG( dbgs() << "LV: Interleaving to saturate store or load ports.\n"); - IC = std::max(StoresIC, LoadsIC); - IntBeneficial = IC > 1; - return UserIC > 0 ? 
UserIC : IC; + return std::max(StoresIC, LoadsIC); } // If there are scalar reductions and TTI has enabled aggressive @@ -4818,27 +4814,22 @@ unsigned LoopVectorizationPlanner::selectInterleaveCount( LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); // Interleave no less than SmallIC but not as aggressive as the normal IC // to satisfy the rare situation when resources are too limited. - IC = std::max(IC / 2, SmallIC); - IntBeneficial = IC > 1; - return UserIC > 0 ? UserIC : IC; + return std::max(IC / 2, SmallIC); } LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); - IC = std::max(SmallIC, UserIC); - IntBeneficial = IC > 1; - return UserIC > 0 ? UserIC : IC; + return SmallIC; } // Interleave if this is a large loop (small loops are already dealt with by // this point) that could benefit from interleaving. if (AggressivelyInterleaveReductions) { LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); - IntBeneficial = IC > 1; - return UserIC > 0 ? UserIC : IC; + return IC; } LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); - return std::max(1U, UserIC); + return 1; } bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, @@ -9851,7 +9842,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Get user vectorization factor and interleave count. ElementCount UserVF = Hints.getWidth(); - unsigned UserIC = Hints.getInterleave(); + unsigned UserIC = LVL.isSafeForAnyVectorWidth() ? Hints.getInterleave() : 1; // Plan how to best vectorize. LVP.plan(UserVF, UserIC); @@ -9862,16 +9853,15 @@ bool LoopVectorizePass::processLoop(Loop *L) { LVP.emitInvalidCostRemarks(ORE); GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind); - bool IntBeneficial = false; if (LVP.hasPlanWithVF(VF.Width)) { // Select the interleave count. 
- IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, UserIC, - VF.Cost, IntBeneficial); + IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost); + unsigned SelectedIC = std::max(IC, UserIC); // Optimistically generate runtime checks if they are needed. Drop them if // they turn out to not be profitable. - if (VF.Width.isVector() || IC > 1) { - Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, IC); + if (VF.Width.isVector() || SelectedIC > 1) { + Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); // Bail out early if either the SCEV or memory runtime checks are known to // fail. In that case, the vector loop would never execute. @@ -9917,7 +9907,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { VectorizeLoop = false; } - if (IC == 1 && UserIC > 1) { + if (UserIC == 1 && Hints.getInterleave() > 1) { LLVM_DEBUG(dbgs() << "LV: Ignoring user-specified interleave count.\n"); IntDiagMsg = {"InterleavingUnsafe", "Ignoring user-specified interleave count due to possibly " @@ -9931,7 +9921,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { IntDiagMsg = {"InterleavingAvoided", "Ignoring UserIC, because interleaving was avoided up front"}; InterleaveLoop = false; - } else if (!IntBeneficial && UserIC <= 1) { + } else if (IC == 1 && UserIC <= 1) { // Tell the user interleaving is not beneficial. LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); IntDiagMsg = { @@ -9943,7 +9933,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { IntDiagMsg.second += " and is explicitly disabled or interleave count is set to 1"; } - } else if (IntBeneficial && UserIC == 1) { + } else if (IC > 1 && UserIC == 1) { // Tell the user interleaving is beneficial, but it explicitly disabled. 
LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly " "disabled.\n"); @@ -9966,6 +9956,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { InterleaveLoop = false; } + // Override IC if user provided an interleave count. + IC = UserIC > 0 ? UserIC : IC; + // Emit diagnostic messages, if any. const char *VAPassName = Hints.vectorizeAnalysisPassName(); if (!VectorizeLoop && !InterleaveLoop) { From 444e58cc075d5416fce5a6875fb32d9d403a8472 Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Tue, 21 Oct 2025 12:44:44 +0000 Subject: [PATCH 5/6] - Only set UserIC to 1 if an interleave count > 1 was requested --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 1bf6529fc4011..3cae917643b45 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9842,7 +9842,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Get user vectorization factor and interleave count. ElementCount UserVF = Hints.getWidth(); - unsigned UserIC = LVL.isSafeForAnyVectorWidth() ? Hints.getInterleave() : 1; + unsigned UserIC = Hints.getInterleave(); + if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth()) + UserIC = 1; // Plan how to best vectorize. 
LVP.plan(UserVF, UserIC); From 5ad8fb2ac1cde81889c27874f3000108ea25bf8d Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Tue, 21 Oct 2025 14:58:42 +0000 Subject: [PATCH 6/6] - Add an assert for !isSafeForAnyVectorWidth() - Cleanup test --- .../Transforms/Vectorize/LoopVectorize.cpp | 2 ++ .../LoopVectorize/unsafe-ic-hint-remark.ll | 19 +++++++++---------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 3cae917643b45..4a185023fae29 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9910,6 +9910,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { } if (UserIC == 1 && Hints.getInterleave() > 1) { + assert(!LVL.isSafeForAnyVectorWidth() && + "UserIC should only be ignored due to unsafe dependencies"); LLVM_DEBUG(dbgs() << "LV: Ignoring user-specified interleave count.\n"); IntDiagMsg = {"InterleavingUnsafe", "Ignoring user-specified interleave count due to possibly " diff --git a/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll b/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll index f2fb7a240bc9e..01934b1d7fbd2 100644 --- a/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll +++ b/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll @@ -4,23 +4,22 @@ ; CHECK: remark: :0:0: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop. 
; CHECK-LABEL: @loop_distance_4 -define void @loop_distance_4(i64 %N, ptr %a, ptr %b) { +define void @loop_distance_4(ptr %a, ptr %b) { entry: - %cmp10 = icmp sgt i64 %N, 4 - br i1 %cmp10, label %for.body, label %for.end + br label %loop -for.body: - %indvars.iv = phi i64 [ 4, %entry ], [ %indvars.iv.next, %for.body ] - %0 = getelementptr i32, ptr %b, i64 %indvars.iv +loop: + %iv = phi i64 [ 4, %entry ], [ %iv.next, %loop ] + %0 = getelementptr i32, ptr %b, i64 %iv %arrayidx = getelementptr i8, ptr %0, i64 -16 %1 = load i32, ptr %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds nuw i32, ptr %a, i64 %indvars.iv + %arrayidx2 = getelementptr inbounds nuw i32, ptr %a, i64 %iv %2 = load i32, ptr %arrayidx2, align 4 %add = add nsw i32 %2, %1 store i32 %add, ptr %0, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, %N - br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 64 + br i1 %exitcond.not, label %for.end, label %loop, !llvm.loop !1 for.end: ret void