[LV]: Improve accuracy of calculating remaining iterations of MainLoopVF #156723

hassnaaHamdi · 2025-09-03T17:44:56Z

Transform TC and VF to same numerical space when they are in different spaces.

llvmbot · 2025-09-03T19:21:35Z

@llvm/pr-subscribers-vectorizers

@llvm/pr-subscribers-llvm-transforms

Author: Hassnaa Hamdi (hassnaaHamdi)

Changes

Account for vscale for vscale-based TC when calculating remaining iterations.

Patch is 63.28 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/156723.diff

2 Files Affected:

(modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+14-1)
(modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll (+262-326)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3fbeef1211954..ebdd3a69cf0e1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4411,8 +4411,21 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
   const SCEV *TC =
       vputils::getSCEVExprForVPValue(getPlanFor(MainLoopVF).getTripCount(), SE);
   assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable");
+
+  // TODO: Maybe this could be removed when SCEV can evaluate expressions with
+  // 'vscale'.
+  // If TC is multiple of vscale, try to get estimated value:
+  if (match(TC, m_scev_Mul(m_SCEV(), m_SCEVVScale()))) {
+    std::optional<ElementCount> BestKnownTC =
+        getSmallBestKnownTC(PSE, OrigLoop);
+    if (BestKnownTC) {
+      unsigned EstimatedRuntimeTC =
+          estimateElementCount(*BestKnownTC, CM.getVScaleForTuning());
+      TC = SE.getConstant(TCType, EstimatedRuntimeTC);
+    }
+  }
   RemainingIterations =
-      SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
+      SE.getURemExpr(TC, SE.getElementCount(TCType, EstimatedRuntimeVF * IC));
 
   // No iterations left to process in the epilogue.
   if (RemainingIterations->isZero())
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
index 6b0da1bb2ed82..d1f13a344ecbf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
@@ -10,8 +10,7 @@
 target triple = "aarch64-linux-gnu"
 
 ; DEBUG: LV: Checking a loop in 'main_vf_vscale_x_16'
-; DEBUG: Create Skeleton for epilogue vectorized loop (first pass)
-; DEBUG: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:vscale x 8, Epilogue Loop UF:1
+; DEBUG: Executing best plan with VF=vscale x 16, UF=2
 
 ; DEBUG-FORCED: LV: Checking a loop in 'main_vf_vscale_x_16'
 ; DEBUG-FORCED: LEV: Epilogue vectorization factor is forced.
@@ -20,61 +19,33 @@ target triple = "aarch64-linux-gnu"
 
 define void @main_vf_vscale_x_16(ptr %A) #0 {
 ; CHECK-LABEL: @main_vf_vscale_x_16(
-; CHECK-NEXT:  iter.check:
+; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 5
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK:       vector.main.loop.iter.check:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 5
-; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 1024, [[TMP3]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 32
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP5]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP18:%.*]] = shl nuw i64 [[TMP17]], 4
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 [[TMP18]]
-; CHECK-NEXT:    store <vscale x 16 x i8> splat (i8 1), ptr [[TMP14]], align 1
-; CHECK-NEXT:    store <vscale x 16 x i8> splat (i8 1), ptr [[TMP19]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 [[TMP6]]
+; CHECK-NEXT:    store <vscale x 16 x i8> splat (i8 1), ptr [[TMP4]], align 1
+; CHECK-NEXT:    store <vscale x 16 x i8> splat (i8 1), ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
-; CHECK-NEXT:    [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP22:%.*]] = shl nuw i64 [[TMP21]], 3
-; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP22]]
-; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP24:%.*]] = mul nuw i64 [[TMP23]], 8
-; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 1024, [[TMP24]]
-; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 1024, [[N_MOD_VF2]]
-; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX5]]
-; CHECK-NEXT:    store <vscale x 8 x i8> splat (i8 1), ptr [[TMP28]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT6]] = add nuw i64 [[INDEX5]], [[TMP24]]
-; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
-; CHECK-NEXT:    br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[CMP_N4:%.*]] = icmp eq i64 1024, [[N_VEC3]]
-; CHECK-NEXT:    br i1 [[CMP_N4]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ;
@@ -94,32 +65,32 @@ define void @main_vf_vscale_x_16(ptr %A) #0 {
 ; CHECK-VF8-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-VF8:       vector.body:
 ; CHECK-VF8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF8-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-VF8-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF8-NEXT:    [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 4
-; CHECK-VF8-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 [[TMP16]]
-; CHECK-VF8-NEXT:    store <vscale x 16 x i8> splat (i8 1), ptr [[TMP12]], align 1
-; CHECK-VF8-NEXT:    store <vscale x 16 x i8> splat (i8 1), ptr [[TMP17]], align 1
+; CHECK-VF8-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT:    [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 4
+; CHECK-VF8-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 [[TMP6]]
+; CHECK-VF8-NEXT:    store <vscale x 16 x i8> splat (i8 1), ptr [[TMP4]], align 1
+; CHECK-VF8-NEXT:    store <vscale x 16 x i8> splat (i8 1), ptr [[TMP7]], align 1
 ; CHECK-VF8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-VF8-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF8-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-VF8-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF8-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-VF8:       middle.block:
 ; CHECK-VF8-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; CHECK-VF8-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK-VF8:       vec.epilog.iter.check:
 ; CHECK-VF8-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
 ; CHECK-VF8-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
-; CHECK-VF8-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-VF8-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
 ; CHECK-VF8:       vec.epilog.ph:
 ; CHECK-VF8-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-VF8-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK-VF8:       vec.epilog.vector.body:
 ; CHECK-VF8-NEXT:    [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-VF8-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX1]]
-; CHECK-VF8-NEXT:    store <8 x i8> splat (i8 1), ptr [[TMP20]], align 1
+; CHECK-VF8-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX1]]
+; CHECK-VF8-NEXT:    store <8 x i8> splat (i8 1), ptr [[TMP9]], align 1
 ; CHECK-VF8-NEXT:    [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8
-; CHECK-VF8-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 1024
-; CHECK-VF8-NEXT:    br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-VF8-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 1024
+; CHECK-VF8-NEXT:    br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK-VF8:       vec.epilog.middle.block:
 ; CHECK-VF8-NEXT:    br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK-VF8:       vec.epilog.scalar.ph:
@@ -147,22 +118,22 @@ define void @main_vf_vscale_x_2_no_epi_iteration(ptr %A) #0 vscale_range(8, 8) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i64 [[TMP8]]
-; CHECK-NEXT:    store <vscale x 2 x i64> splat (i64 1), ptr [[TMP6]], align 1
-; CHECK-NEXT:    store <vscale x 2 x i64> splat (i64 1), ptr [[TMP9]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i64 [[TMP4]]
+; CHECK-NEXT:    store <vscale x 2 x i64> splat (i64 1), ptr [[TMP2]], align 1
+; CHECK-NEXT:    store <vscale x 2 x i64> splat (i64 1), ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -187,32 +158,32 @@ define void @main_vf_vscale_x_2_no_epi_iteration(ptr %A) #0 vscale_range(8, 8) {
 ; CHECK-VF8-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-VF8:       vector.body:
 ; CHECK-VF8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF8-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-VF8-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF8-NEXT:    [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 1
-; CHECK-VF8-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i64 [[TMP8]]
-; CHECK-VF8-NEXT:    store <vscale x 2 x i64> splat (i64 1), ptr [[TMP6]], align 1
-; CHECK-VF8-NEXT:    store <vscale x 2 x i64> splat (i64 1), ptr [[TMP9]], align 1
+; CHECK-VF8-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT:    [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 1
+; CHECK-VF8-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i64 [[TMP6]]
+; CHECK-VF8-NEXT:    store <vscale x 2 x i64> splat (i64 1), ptr [[TMP4]], align 1
+; CHECK-VF8-NEXT:    store <vscale x 2 x i64> splat (i64 1), ptr [[TMP7]], align 1
 ; CHECK-VF8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-VF8-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF8-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-VF8-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF8-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK-VF8:       middle.block:
 ; CHECK-VF8-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; CHECK-VF8-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK-VF8:       vec.epilog.iter.check:
 ; CHECK-VF8-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
 ; CHECK-VF8-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
-; CHECK-VF8-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-VF8-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]]
 ; CHECK-VF8:       vec.epilog.ph:
 ; CHECK-VF8-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-VF8-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK-VF8:       vec.epilog.vector.body:
 ; CHECK-VF8-NEXT:    [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-VF8-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX1]]
-; CHECK-VF8-NEXT:    store <8 x i64> splat (i64 1), ptr [[TMP11]], align 1
+; CHECK-VF8-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX1]]
+; CHECK-VF8-NEXT:    store <8 x i64> splat (i64 1), ptr [[TMP9]], align 1
 ; CHECK-VF8-NEXT:    [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8
-; CHECK-VF8-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 1024
-; CHECK-VF8-NEXT:    br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-VF8-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 1024
+; CHECK-VF8-NEXT:    br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK-VF8:       vec.epilog.middle.block:
 ; CHECK-VF8-NEXT:    br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK-VF8:       vec.epilog.scalar.ph:
@@ -267,51 +238,51 @@ define void @main_vf_vscale_x_2(ptr %A, i64 %n) #0 vscale_range(8, 8) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 1
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i64 [[TMP16]]
-; CHECK-NEXT:    store <vscale x 2 x i64> splat (i64 1), ptr [[TMP12]], align 1
-; CHECK-NEXT:    store <vscale x 2 x i64> splat (i64 1), ptr [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i64 [[TMP6]]
+; CHECK-NEXT:    store <vscale x 2 x i64> splat (i64 1), ptr [[TMP4]], align 1
+; CHECK-NEXT:    store <vscale x 2 x i64> splat (i64 1), ptr [[TMP7]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
-; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF7:![0-9]+]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[SCALAR_PH]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[N]], 8
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[IN...
[truncated]

david-arm

Thanks for this @hassnaaHamdi!

david-arm · 2025-09-04T09:19:57Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+  if (match(TC, m_scev_Mul(m_SCEV(), m_SCEVVScale()))) {
+    std::optional<ElementCount> BestKnownTC =
+        getSmallBestKnownTC(PSE, OrigLoop);
+    if (BestKnownTC) {


nit: I think you can just do:

if (std::optional<ElementCount> BestKnownTC = getSmallBestKnownTC(PSE, OrigLoop)) { ...

david-arm · 2025-09-04T09:23:37Z

llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll

Could you regenerate the CHECK lines for this test in a seperate pre-commit patch please? It's not your fault, but it's quite hard to see which test changes are actually related to the code changes.

github-actions · 2025-09-07T22:43:09Z

✅ With the latest revision this PR passed the C/C++ code formatter.

hassnaaHamdi · 2025-09-08T12:42:27Z

There is a failure because this file Transforms/LoopVectorize/AArch64/sve-epilog-vscale-fixed.ll got affected and I didn't update it deliberately because the new change will hide the original motivation behind the test. So I will create a NFC for that test file to create a new test case that doesn't get affected by this patch.

paulwalker-arm · 2025-09-10T11:50:47Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+  if (match(TC, m_scev_Mul(m_SCEV(), m_SCEVVScale()))) {
+    if (std::optional<ElementCount> BestKnownTC =


For the case when both TC and VF are multiples of vscale this is effectively doing the opposite of the PR's title in that we're going from potentially knowing the exact value for RemainingIterations to having a guess based on the tuning option?

I've uploaded a WIP patch (#157836, which is loosely blocked on #141798) where I'm improving SCEV's handling for vscale to improve control flow for vscale based loops.

Yeah, this is not the most optimal solution that's why there is a TODO depending on SCEV.
For the mentioned case the current logic doesn't know the exact value for RemainingIterations , while the proposed solution approximately decides it.

I guess I'm suggestion we should restrict the use of estimateElementCount to only those cases where the values being compared are in different domains (i.e. one is fixed and the other scalable). That is the scenario where vscale becomes problematic and providing an estimate is our only option and typically better than doing nothing.

Sure, there are scalable TC and VF cases where we don't do a great job, but often that's due to missing optimisation and/or analysis.

paulwalker-arm · 2025-09-10T11:57:33Z

llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll

+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDEX6]]
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <vscale x 2 x float>, ptr [[TMP24]], align 4
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDEX6]]
+; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <vscale x 2 x float>, ptr [[TMP25]], align 4
+; CHECK-NEXT:    [[TMP26:%.*]] = fmul <vscale x 2 x float> [[WIDE_LOAD7]], [[WIDE_LOAD8]]
+; CHECK-NEXT:    store <vscale x 2 x float> [[TMP26]], ptr [[TMP25]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], [[TMP23]]
+; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC5]]
+; CHECK-NEXT:    br i1 [[TMP27]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]


This looks worse? perhaps an artefact of what I mentioned above?

The loop has a trip count of 1033*vscale, the main VF is 4*vscale, so after the vector loop there will be (1033-(1033/4))*vscale ==> vscale iterations remaining, yet the epilogue loop now has a VF of 2*vscale and thus will never execute?

That is an issue related to deciding the correct VF for the epilogue, it's fixed here:
#156724

david-arm · 2025-09-30T12:13:34Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+  // different numerical space.
+  // TODO: This is a workaround until SCEV can evaluate vscale-based
+  // expressions.
+  if (match(TC, m_scev_Mul(m_SCEV(), m_SCEVVScale())) &&


I think it's probably worth caching this result in a variable since it's used in two places.

david-arm · 2025-09-30T12:15:59Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+  // TODO: This is a workaround until SCEV can evaluate vscale-based
+  // expressions.
+  if (match(TC, m_scev_Mul(m_SCEV(), m_SCEVVScale())) &&
+      MainLoopVF.isScalable()) {


It might be a bit clearer if you handle all cases where the numerical space is the same here, i.e.

bool ScalableTC = match(TC, m_scev_Mul(m_SCEV(), m_SCEVVScale())); if (ScalableTC == MainLoopVF.isScalable()) RemainingIterations = SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC)); else { if (ScalableTC) { ...

david-arm · 2025-09-30T12:21:12Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+    RemainingIterations =
+        SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
+  } else {
+    if (match(TC, m_scev_Mul(m_SCEV(), m_SCEVVScale()))) {


I'm not sure if this is right, because what about when the trip count is tc = mul i64 %n, %vscale, i.e. unknown?

Don't you just want to create a SCEV mul expression based on CM.getVScaleForTuning, i.e. if you did something like:

const SCEV *KnownMinTC; bool ScalableTC = match(TC, m_scev_Mul(m_SCEV(KnownMinTC), m_SCEVVScale())); ... if (ScalableTC) { TC = SE.getMulExpr(KnownMinTC, SE.getConstant(CM.getVScaleForTuning())); }

hassnaaHamdi · 2025-10-02T00:20:03Z

I think this patch is not needed now as #157836 is landed. SCEV now can evaluate vscale-based expressions.

hassnaaHamdi · 2025-10-06T12:14:45Z

Sorry, I got confused about the SCEV patch. It handles the case when both TC and VF are vscale-based only.
So, this patch is still needed for handling the case of different numerical spaces.

david-arm · 2025-10-10T14:25:36Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp


  // No iterations left to process in the epilogue.
-  if (RemainingIterations->isZero())
+  if (!RemainingIterations || RemainingIterations->isZero())


Is this change needed? I think RemainingIterations should always be non-null because it's always set to the value returned by getURemExpr. The worst thing that can happen is that it returns a pointer to a SCEVUnknown object, but isZero will just return false.

david-arm · 2025-10-10T14:29:11Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+  } else {
+    if (ScalableTC)
+      RemainingIterations = SE.getURemExpr(
+          KnownMinTC, SE.getElementCount(TCType, MainLoopVF * IC));


I think you need to multiply KnownMinTC by CM.getVScaleForTuning() to get a more accurate estimate.

david-arm · 2025-10-10T14:35:38Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+    RemainingIterations =
+        SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
+  } else {
+    if (ScalableTC)


You can fold this if into the else above, i.e.

if (ScalableTC == MainLoopVF.isScalable()) RemainingIterations = SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC)); else if (ScalableTC) RemainingIterations = SE.getURemExpr( KnownMinTC, SE.getElementCount(TCType, MainLoopVF * IC)); else RemainingIterations = SE.getURemExpr( TC, SE.getElementCount(TCType, EstimatedRuntimeVF * IC));

david-arm · 2025-10-10T14:38:35Z

llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll

 ; DEBUG: LV: Checking a loop in 'main_vf_vscale_x_16'
-; DEBUG: Create Skeleton for epilogue vectorized loop (first pass)
-; DEBUG: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1
+; DEBUG: Executing best plan with VF=vscale x 16, UF=2


Maybe worth adding a DEBUG-NOT: Create Skeleton for epilogue vectorized loop here as well because at the moment you're only testing what VF we chose for the main loop.

david-arm · 2025-10-16T12:18:19Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+        SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
+  else if (ScalableTC) {
+    const SCEV *EstimatedTC = SE.getMulExpr(
+        KnownMinTC, SE.getConstant(TCType, CM.getVScaleForTuning().value()));


I think you need CM.getVScaleForTuning().value_or(1) here because the value is only available if the vscale_range attribute is present on the function.

Given that TC is already scalable, isn't that an evidence for the availability of CM.getVScaleForTuning() ?

Unfortunately not. The vscale_range attribute is not guaranteed to be present on functions using scalable vectors. If the attribute is missing then getVScaleForTuning will return std::nullopt (see initializeVScaleForTuning) and std::optional::value() will trigger an exception here.

paulwalker-arm · 2025-10-16T14:08:17Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+  bool ScalableTC = match(TC, m_scev_Mul(m_SCEV(KnownMinTC), m_SCEVVScale())) ||
+                    match(TC, m_scev_Mul(m_SCEVVScale(), m_SCEV(KnownMinTC)));


Suggested change

bool ScalableTC = match(TC, m_scev_Mul(m_SCEV(KnownMinTC), m_SCEVVScale())) ||

match(TC, m_scev_Mul(m_SCEVVScale(), m_SCEV(KnownMinTC)));

bool ScalableTC = match(TC, m_scev_c_Mul(m_SCEV(KnownMinTC), m_SCEVVScale()));

Change-Id: I5683aa86f39e92fdeea5089268cedc3c07f0bb8c

Change-Id: I24d2181a61d8bcaaaf2c0efb4275226a6887aabc

… not set

david-arm

LGTM!

david-arm · 2025-10-23T07:50:16Z

llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll

-; DEFAULT-NEXT:    br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; DEFAULT-NEXT:    [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX1]]
+; DEFAULT-NEXT:    store <vscale x 8 x i8> zeroinitializer, ptr [[TMP9]], align 1


Looks like this change is happening because of the unfair check in the loop vectoriser code:

if (RemainingIterations && !NextVF.Width.isScalable()) { ...

i.e. we're only seeing if the remaining iterations is greater than the next VF if it's fixed-width. However, we can apply the same check for scalable vectors by using an estimate of the next VF using vscale-for-tuning. But that's for a different PR!

llvm-ci · 2025-10-26T14:56:49Z

LLVM Buildbot has detected a new failure on builder mlir-nvidia-gcc7 running on mlir-nvidia while building llvm at step 7 "test-build-check-mlir-build-only-check-mlir".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/116/builds/20178

Here is the relevant piece of the build log for the reference

Step 7 (test-build-check-mlir-build-only-check-mlir) failure: test (failure)
******************** TEST 'MLIR :: Integration/GPU/CUDA/async.mlir' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 1
/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -gpu-kernel-outlining  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)'  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -gpu-async-region -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary="format=fatbin"  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -async-to-async-runtime -async-runtime-ref-counting  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -convert-async-to-llvm -convert-func-to-llvm -convert-arith-to-llvm -convert-cf-to-llvm -reconcile-unrealized-casts  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-runner    --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_cuda_runtime.so    --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_async_runtime.so    --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_runner_utils.so    --entry-point-result=void -O0  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/FileCheck /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -gpu-kernel-outlining
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt '-pass-pipeline=builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)'
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -gpu-async-region -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary=format=fatbin
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -async-to-async-runtime -async-runtime-ref-counting
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -convert-async-to-llvm -convert-func-to-llvm -convert-arith-to-llvm -convert-cf-to-llvm -reconcile-unrealized-casts
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-runner --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_cuda_runtime.so --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_async_runtime.so --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_runner_utils.so --entry-point-result=void -O0
# .---command stderr------------
# | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventSynchronize(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# `-----------------------------
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/FileCheck /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir
# .---command stderr------------
# | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir:68:12: error: CHECK: expected string not found in input
# |  // CHECK: [84, 84]
# |            ^
# | <stdin>:1:1: note: scanning from here
# | Unranked Memref base@ = 0x58089f123090 rank = 1 offset = 0 sizes = [2] strides = [1] data = 
# | ^
# | <stdin>:2:1: note: possible intended match here
# | [42, 42]
# | ^
# | 
# | Input file: <stdin>
# | Check file: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# |             1: Unranked Memref base@ = 0x58089f123090 rank = 1 offset = 0 sizes = [2] strides = [1] data =  
# | check:68'0     X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ error: no match found
# |             2: [42, 42] 
# | check:68'0     ~~~~~~~~~
# | check:68'1     ?         possible intended match
...

llvm-ci · 2025-10-26T15:08:29Z

LLVM Buildbot has detected a new failure on builder clang-aarch64-sve2-vla running on linaro-g4-02 while building llvm at step 7 "ninja check 1".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/198/builds/9130

Here is the relevant piece of the build log for the reference

Step 7 (ninja check 1) failure: stage 1 checked (failure)
******************** TEST 'HWAddressSanitizer-aarch64 :: TestCases/hwasan_symbolize_stack_overflow.cpp' FAILED ********************
Exit Code: 1

Command Output (stderr):
--
rm -rf /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp; mkdir /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp # RUN: at line 1
+ rm -rf /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp
+ mkdir /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp
/home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/./bin/clang    -Wthread-safety -Wthread-safety-reference -Wthread-safety-beta   -gline-tables-only -fsanitize=hwaddress -fuse-ld=lld -mllvm -hwasan-globals -mllvm -hwasan-use-short-granules -mllvm -hwasan-instrument-landing-pads=0 -mllvm -hwasan-instrument-personality-functions -Wl,--build-id -g /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/llvm/compiler-rt/test/hwasan/TestCases/hwasan_symbolize_stack_overflow.cpp -o /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp/hwasan_overflow # RUN: at line 2
+ /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/./bin/clang -Wthread-safety -Wthread-safety-reference -Wthread-safety-beta -gline-tables-only -fsanitize=hwaddress -fuse-ld=lld -mllvm -hwasan-globals -mllvm -hwasan-use-short-granules -mllvm -hwasan-instrument-landing-pads=0 -mllvm -hwasan-instrument-personality-functions -Wl,--build-id -g /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/llvm/compiler-rt/test/hwasan/TestCases/hwasan_symbolize_stack_overflow.cpp -o /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp/hwasan_overflow
env HWASAN_OPTIONS=disable_allocator_tagging=1:random_tags=0:fail_without_syscall_abi=0:symbolize=0 not  /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp/hwasan_overflow 16 2>&1 | /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/bin/hwasan_symbolize --symbols /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp --index | FileCheck /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/llvm/compiler-rt/test/hwasan/TestCases/hwasan_symbolize_stack_overflow.cpp --check-prefixes=CHECK,AFTER0 # RUN: at line 3
+ env HWASAN_OPTIONS=disable_allocator_tagging=1:random_tags=0:fail_without_syscall_abi=0:symbolize=0 not /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp/hwasan_overflow 16
+ /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/bin/hwasan_symbolize --symbols /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp --index
+ FileCheck /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/llvm/compiler-rt/test/hwasan/TestCases/hwasan_symbolize_stack_overflow.cpp --check-prefixes=CHECK,AFTER0
Could not find symbols for lib/aarch64-linux-gnu/libc.so.6 (Build ID: 034c9554b03099fc969ae2528ea6fd26286057d8)
env HWASAN_OPTIONS=disable_allocator_tagging=1:random_tags=0:fail_without_syscall_abi=0:symbolize=0 not  /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp/hwasan_overflow 17 2>&1 | /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/bin/hwasan_symbolize --symbols /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp --index | FileCheck /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/llvm/compiler-rt/test/hwasan/TestCases/hwasan_symbolize_stack_overflow.cpp --check-prefixes=CHECK,AFTER1 # RUN: at line 4
+ env HWASAN_OPTIONS=disable_allocator_tagging=1:random_tags=0:fail_without_syscall_abi=0:symbolize=0 not /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp/hwasan_overflow 17
+ /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/bin/hwasan_symbolize --symbols /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp --index
+ FileCheck /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/llvm/compiler-rt/test/hwasan/TestCases/hwasan_symbolize_stack_overflow.cpp --check-prefixes=CHECK,AFTER1
Could not find symbols for lib/aarch64-linux-gnu/libc.so.6 (Build ID: 034c9554b03099fc969ae2528ea6fd26286057d8)
env HWASAN_OPTIONS=disable_allocator_tagging=1:random_tags=0:fail_without_syscall_abi=0:symbolize=0 not  /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp/hwasan_overflow -1 2>&1 | /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/bin/hwasan_symbolize --symbols /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp --index | FileCheck /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/llvm/compiler-rt/test/hwasan/TestCases/hwasan_symbolize_stack_overflow.cpp --check-prefixes=CHECK,BEFORE1 # RUN: at line 5
+ env HWASAN_OPTIONS=disable_allocator_tagging=1:random_tags=0:fail_without_syscall_abi=0:symbolize=0 not /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp/hwasan_overflow -1
+ /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/bin/hwasan_symbolize --symbols /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp --index
+ FileCheck /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/llvm/compiler-rt/test/hwasan/TestCases/hwasan_symbolize_stack_overflow.cpp --check-prefixes=CHECK,BEFORE1
Could not find symbols for lib/aarch64-linux-gnu/libc.so.6 (Build ID: 034c9554b03099fc969ae2528ea6fd26286057d8)
env HWASAN_OPTIONS=disable_allocator_tagging=1:random_tags=0:fail_without_syscall_abi=0:symbolize=0 not  /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp/hwasan_overflow -17 2>&1 | /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/bin/hwasan_symbolize --symbols /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp --index | FileCheck /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/llvm/compiler-rt/test/hwasan/TestCases/hwasan_symbolize_stack_overflow.cpp --check-prefixes=CHECK,BEFORE17 # RUN: at line 6
+ env HWASAN_OPTIONS=disable_allocator_tagging=1:random_tags=0:fail_without_syscall_abi=0:symbolize=0 not /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp/hwasan_overflow -17
+ /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/bin/hwasan_symbolize --symbols /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp --index
+ FileCheck /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/llvm/compiler-rt/test/hwasan/TestCases/hwasan_symbolize_stack_overflow.cpp --check-prefixes=CHECK,BEFORE17
Could not find symbols for lib/aarch64-linux-gnu/libc.so.6 (Build ID: 034c9554b03099fc969ae2528ea6fd26286057d8)
env HWASAN_OPTIONS=disable_allocator_tagging=1:random_tags=0:fail_without_syscall_abi=0:symbolize=0 not  /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp/hwasan_overflow 1016 2>&1 | /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/bin/hwasan_symbolize --symbols /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp --index | FileCheck /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/llvm/compiler-rt/test/hwasan/TestCases/hwasan_symbolize_stack_overflow.cpp --check-prefixes=CHECK,AFTER1000 # RUN: at line 7
+ env HWASAN_OPTIONS=disable_allocator_tagging=1:random_tags=0:fail_without_syscall_abi=0:symbolize=0 not /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp/hwasan_overflow 1016
+ FileCheck /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/llvm/compiler-rt/test/hwasan/TestCases/hwasan_symbolize_stack_overflow.cpp --check-prefixes=CHECK,AFTER1000
+ /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/bin/hwasan_symbolize --symbols /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp --index
Could not find symbols for lib/aarch64-linux-gnu/libc.so.6 (Build ID: 034c9554b03099fc969ae2528ea6fd26286057d8)
env HWASAN_OPTIONS=disable_allocator_tagging=1:random_tags=0:fail_without_syscall_abi=0:symbolize=0 not  /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp/hwasan_overflow -1000 2>&1 | /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/bin/hwasan_symbolize --symbols /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp --index | FileCheck /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/llvm/compiler-rt/test/hwasan/TestCases/hwasan_symbolize_stack_overflow.cpp --check-prefixes=CHECK,BEFORE1000 # RUN: at line 8
+ env HWASAN_OPTIONS=disable_allocator_tagging=1:random_tags=0:fail_without_syscall_abi=0:symbolize=0 not /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp/hwasan_overflow -1000
+ /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/bin/hwasan_symbolize --symbols /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1/runtimes/runtimes-bins/compiler-rt/test/hwasan/AARCH64/TestCases/Output/hwasan_symbolize_stack_overflow.cpp.tmp --index
+ FileCheck /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/llvm/compiler-rt/test/hwasan/TestCases/hwasan_symbolize_stack_overflow.cpp --check-prefixes=CHECK,BEFORE1000
Could not find symbols for lib/aarch64-linux-gnu/libc.so.6 (Build ID: 034c9554b03099fc969ae2528ea6fd26286057d8)
/home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/llvm/compiler-rt/test/hwasan/TestCases/hwasan_symbolize_stack_overflow.cpp:21:12: error: CHECK: expected string not found in input
 // CHECK: Potentially referenced stack object:
           ^
<stdin>:1:1: note: scanning from here
==1901996==ERROR: HWAddressSanitizer: tag-mismatch on address 0xffffceafff78 at pc 0xbc2c9e1dc9f4
^
<stdin>:10:13: note: possible intended match here
Address 0xffffceafff78 is located in stack of thread T0
            ^

...

…pVF (llvm#156723) Transform TC and VF to same numerical space when they are different.

hassnaaHamdi marked this pull request as ready for review September 3, 2025 19:21

llvmbot added vectorizers llvm:transforms labels Sep 3, 2025

hassnaaHamdi requested review from david-arm and fhahn September 3, 2025 19:21

hassnaaHamdi mentioned this pull request Sep 3, 2025

LV]: consider scalable VF during deciding dead epilogue. #156724

Merged

david-arm requested a review from paulwalker-arm September 4, 2025 09:14

david-arm reviewed Sep 4, 2025

View reviewed changes

hassnaaHamdi force-pushed the lv-rem-iter branch 2 times, most recently from c4a830c to 388abf6 Compare September 7, 2025 22:39

hassnaaHamdi force-pushed the lv-rem-iter branch from 388abf6 to b1acc49 Compare September 7, 2025 23:02

paulwalker-arm reviewed Sep 10, 2025

View reviewed changes

hassnaaHamdi force-pushed the lv-rem-iter branch from b1acc49 to 02ce6db Compare September 29, 2025 08:58

david-arm reviewed Sep 30, 2025

View reviewed changes

hassnaaHamdi closed this Oct 2, 2025

hassnaaHamdi reopened this Oct 6, 2025

hassnaaHamdi force-pushed the lv-rem-iter branch 2 times, most recently from 7b7ff40 to 526f82f Compare October 6, 2025 13:44

hassnaaHamdi requested a review from david-arm October 9, 2025 10:39

david-arm reviewed Oct 10, 2025

View reviewed changes

david-arm reviewed Oct 16, 2025

View reviewed changes

paulwalker-arm reviewed Oct 16, 2025

View reviewed changes

hassnaaHamdi added 3 commits October 20, 2025 16:18

precommit -regen check lines

945728b

Change-Id: I5683aa86f39e92fdeea5089268cedc3c07f0bb8c

[LV]: Make TC and VF in same numerical space

ab7a404

Change-Id: I24d2181a61d8bcaaaf2c0efb4275226a6887aabc

resolve review comments

b18506c

hassnaaHamdi added 2 commits October 20, 2025 16:19

Use m_scev_c_Mul

d962751

rebase and update multiple-result-intrinsics.ll test

94058a7

hassnaaHamdi force-pushed the lv-rem-iter branch from 1d4c7cb to 94058a7 Compare October 20, 2025 16:28

hassnaaHamdi added 2 commits October 21, 2025 14:28

use CM.getVScaleForTuning().value_or(1)) as the vscale value could be…

3fa90d4

… not set

format

20510d7

hassnaaHamdi requested a review from david-arm October 22, 2025 16:22

david-arm approved these changes Oct 23, 2025

View reviewed changes

hassnaaHamdi merged commit be29f0d into llvm:main Oct 26, 2025
9 of 10 checks passed

varun-r-mallya pushed a commit to varun-r-mallya/llvm-project that referenced this pull request Oct 27, 2025

[LV]: Improve accuracy of calculating remaining iterations of MainLoo…

2c3381a

…pVF (llvm#156723) Transform TC and VF to same numerical space when they are different.

dvbuka pushed a commit to dvbuka/llvm-project that referenced this pull request Oct 27, 2025

[LV]: Improve accuracy of calculating remaining iterations of MainLoo…

b24ff0e

…pVF (llvm#156723) Transform TC and VF to same numerical space when they are different.

Lukacma pushed a commit to Lukacma/llvm-project that referenced this pull request Oct 29, 2025

[LV]: Improve accuracy of calculating remaining iterations of MainLoo…

6be6a22

…pVF (llvm#156723) Transform TC and VF to same numerical space when they are different.

aokblast pushed a commit to aokblast/llvm-project that referenced this pull request Oct 30, 2025

[LV]: Improve accuracy of calculating remaining iterations of MainLoo…

5d08931

…pVF (llvm#156723) Transform TC and VF to same numerical space when they are different.

		if (match(TC, m_scev_Mul(m_SCEV(), m_SCEVVScale()))) {
		if (std::optional<ElementCount> BestKnownTC =

		bool ScalableTC = match(TC, m_scev_Mul(m_SCEV(KnownMinTC), m_SCEVVScale())) \|\|
		match(TC, m_scev_Mul(m_SCEVVScale(), m_SCEV(KnownMinTC)));

	bool ScalableTC = match(TC, m_scev_Mul(m_SCEV(KnownMinTC), m_SCEVVScale())) \|\|
	match(TC, m_scev_Mul(m_SCEVVScale(), m_SCEV(KnownMinTC)));
	bool ScalableTC = match(TC, m_scev_c_Mul(m_SCEV(KnownMinTC), m_SCEVVScale()));

[LV]: Improve accuracy of calculating remaining iterations of MainLoopVF #156723

[LV]: Improve accuracy of calculating remaining iterations of MainLoopVF #156723

Uh oh!

Conversation

hassnaaHamdi commented Sep 3, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Sep 3, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

david-arm left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

github-actions bot commented Sep 7, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

hassnaaHamdi commented Sep 8, 2025

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

hassnaaHamdi commented Oct 2, 2025

Uh oh!

hassnaaHamdi commented Oct 6, 2025

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

david-arm left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

llvm-ci commented Oct 26, 2025

Uh oh!

llvm-ci commented Oct 26, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

5 participants

hassnaaHamdi commented Sep 3, 2025 •

edited

Loading

llvmbot commented Sep 3, 2025 •

edited

Loading

github-actions bot commented Sep 7, 2025 •

edited

Loading