Skip to content

Commit 2c3381a

Browse files
hassnaaHamdi authored and varun-r-mallya committed
[LV]: Improve accuracy of calculating remaining iterations of MainLoopVF (llvm#156723)
Transform TC and VF into the same numerical space (both scalable or both fixed) when they differ.
1 parent 3b1e60b commit 2c3381a

File tree

4 files changed: +159 -185 lines changed

4 files changed: +159 -185 lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 15 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4378,8 +4378,21 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
43784378
const SCEV *TC =
43794379
vputils::getSCEVExprForVPValue(getPlanFor(MainLoopVF).getTripCount(), SE);
43804380
assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable");
4381-
RemainingIterations =
4382-
SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
4381+
const SCEV *KnownMinTC;
4382+
bool ScalableTC = match(TC, m_scev_c_Mul(m_SCEV(KnownMinTC), m_SCEVVScale()));
4383+
// Use versions of TC and VF in which both are either scalable or fixed.
4384+
if (ScalableTC == MainLoopVF.isScalable())
4385+
RemainingIterations =
4386+
SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
4387+
else if (ScalableTC) {
4388+
const SCEV *EstimatedTC = SE.getMulExpr(
4389+
KnownMinTC,
4390+
SE.getConstant(TCType, CM.getVScaleForTuning().value_or(1)));
4391+
RemainingIterations = SE.getURemExpr(
4392+
EstimatedTC, SE.getElementCount(TCType, MainLoopVF * IC));
4393+
} else
4394+
RemainingIterations =
4395+
SE.getURemExpr(TC, SE.getElementCount(TCType, EstimatedRuntimeVF * IC));
43834396

43844397
// No iterations left to process in the epilogue.
43854398
if (RemainingIterations->isZero())

llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll

Lines changed: 0 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,6 @@ define void @sincos_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noali
3737
;
3838
; CHECK-ARMPL-LABEL: define void @sincos_f32(
3939
; CHECK-ARMPL-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0:[0-9]+]] {
40-
; CHECK-ARMPL: [[ENTRY:.*:]]
4140
; CHECK-ARMPL: [[VECTOR_PH:.*:]]
4241
; CHECK-ARMPL: [[VECTOR_BODY:.*:]]
4342
; CHECK-ARMPL: [[VECTOR_BODY1:.*:]]
@@ -51,15 +50,6 @@ define void @sincos_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noali
5150
; CHECK-ARMPL: store <vscale x 4 x float> [[TMP15]], ptr [[TMP22:%.*]], align 4
5251
; CHECK-ARMPL: store <vscale x 4 x float> [[TMP16]], ptr [[TMP24:%.*]], align 4
5352
; CHECK-ARMPL: store <vscale x 4 x float> [[TMP17]], ptr [[TMP27:%.*]], align 4
54-
; CHECK-ARMPL: [[MIDDLE_BLOCK:.*:]]
55-
; CHECK-ARMPL: [[SCALAR_PH:.*:]]
56-
; CHECK-ARMPL: [[FOR_BODY:.*:]]
57-
; CHECK-ARMPL: [[VEC_EPILOG_VECTOR_BODY:.*:]]
58-
; CHECK-ARMPL: [[TMP29:%.*]] = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> [[WIDE_LOAD3:%.*]])
59-
; CHECK-ARMPL: [[TMP25:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 0
60-
; CHECK-ARMPL: [[TMP26:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 1
61-
; CHECK-ARMPL: store <4 x float> [[TMP25]], ptr [[TMP30:%.*]], align 4
62-
; CHECK-ARMPL: store <4 x float> [[TMP26]], ptr [[TMP28:%.*]], align 4
6353
; CHECK-ARMPL: [[VEC_EPILOG_MIDDLE_BLOCK:.*:]]
6454
; CHECK-ARMPL: [[VEC_EPILOG_SCALAR_PH:.*:]]
6555
; CHECK-ARMPL: [[FOR_BODY1:.*:]]
@@ -269,7 +259,6 @@ define void @modf_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias
269259
;
270260
; CHECK-ARMPL-LABEL: define void @modf_f32(
271261
; CHECK-ARMPL-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0]] {
272-
; CHECK-ARMPL: [[ENTRY:.*:]]
273262
; CHECK-ARMPL: [[VECTOR_PH:.*:]]
274263
; CHECK-ARMPL: [[VECTOR_BODY:.*:]]
275264
; CHECK-ARMPL: [[VECTOR_BODY1:.*:]]
@@ -283,15 +272,6 @@ define void @modf_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias
283272
; CHECK-ARMPL: store <vscale x 4 x float> [[TMP15]], ptr [[TMP22:%.*]], align 4
284273
; CHECK-ARMPL: store <vscale x 4 x float> [[TMP16]], ptr [[TMP24:%.*]], align 4
285274
; CHECK-ARMPL: store <vscale x 4 x float> [[TMP17]], ptr [[TMP27:%.*]], align 4
286-
; CHECK-ARMPL: [[MIDDLE_BLOCK:.*:]]
287-
; CHECK-ARMPL: [[SCALAR_PH:.*:]]
288-
; CHECK-ARMPL: [[FOR_BODY:.*:]]
289-
; CHECK-ARMPL: [[VEC_EPILOG_VECTOR_BODY:.*:]]
290-
; CHECK-ARMPL: [[TMP29:%.*]] = call { <4 x float>, <4 x float> } @llvm.modf.v4f32(<4 x float> [[WIDE_LOAD3:%.*]])
291-
; CHECK-ARMPL: [[TMP25:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 0
292-
; CHECK-ARMPL: [[TMP26:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 1
293-
; CHECK-ARMPL: store <4 x float> [[TMP25]], ptr [[TMP30:%.*]], align 4
294-
; CHECK-ARMPL: store <4 x float> [[TMP26]], ptr [[TMP28:%.*]], align 4
295275
; CHECK-ARMPL: [[VEC_EPILOG_MIDDLE_BLOCK:.*:]]
296276
; CHECK-ARMPL: [[VEC_EPILOG_SCALAR_PH:.*:]]
297277
; CHECK-ARMPL: [[FOR_BODY1:.*:]]
@@ -429,7 +409,6 @@ define void @sincospi_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noa
429409
;
430410
; CHECK-ARMPL-LABEL: define void @sincospi_f32(
431411
; CHECK-ARMPL-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0]] {
432-
; CHECK-ARMPL: [[ENTRY:.*:]]
433412
; CHECK-ARMPL: [[VECTOR_PH:.*:]]
434413
; CHECK-ARMPL: [[VECTOR_BODY:.*:]]
435414
; CHECK-ARMPL: [[VECTOR_BODY1:.*:]]
@@ -443,15 +422,6 @@ define void @sincospi_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noa
443422
; CHECK-ARMPL: store <vscale x 4 x float> [[TMP15]], ptr [[TMP22:%.*]], align 4
444423
; CHECK-ARMPL: store <vscale x 4 x float> [[TMP16]], ptr [[TMP24:%.*]], align 4
445424
; CHECK-ARMPL: store <vscale x 4 x float> [[TMP17]], ptr [[TMP27:%.*]], align 4
446-
; CHECK-ARMPL: [[MIDDLE_BLOCK:.*:]]
447-
; CHECK-ARMPL: [[SCALAR_PH:.*:]]
448-
; CHECK-ARMPL: [[FOR_BODY:.*:]]
449-
; CHECK-ARMPL: [[VEC_EPILOG_VECTOR_BODY:.*:]]
450-
; CHECK-ARMPL: [[TMP29:%.*]] = call { <4 x float>, <4 x float> } @llvm.sincospi.v4f32(<4 x float> [[WIDE_LOAD3:%.*]])
451-
; CHECK-ARMPL: [[TMP25:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 0
452-
; CHECK-ARMPL: [[TMP26:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 1
453-
; CHECK-ARMPL: store <4 x float> [[TMP25]], ptr [[TMP30:%.*]], align 4
454-
; CHECK-ARMPL: store <4 x float> [[TMP26]], ptr [[TMP28:%.*]], align 4
455425
; CHECK-ARMPL: [[VEC_EPILOG_MIDDLE_BLOCK:.*:]]
456426
; CHECK-ARMPL: [[VEC_EPILOG_SCALAR_PH:.*:]]
457427
; CHECK-ARMPL: [[FOR_BODY1:.*:]]

0 commit comments

Comments (0)