Skip to content
Merged
23 changes: 23 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4546,6 +4546,12 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
return false;
}

/// Returns true if at least one VPRegionBlock reached by walking \p Plan from
/// its entry with vp_depth_first_deep reports isReplicator().
static bool hasReplicatorRegion(VPlan &Plan) {
  // Keep the traversal range in a named local so it stays alive for the whole
  // loop rather than only for the range-initializer expression.
  auto AllBlocks = vp_depth_first_deep(Plan.getEntry());
  for (auto *Region : VPBlockUtils::blocksOnly<VPRegionBlock>(AllBlocks))
    if (Region->isReplicator())
      return true;
  return false;
}

#ifndef NDEBUG
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
Expand Down Expand Up @@ -4618,6 +4624,15 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
continue;
}

if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
LLVM_DEBUG(
dbgs()
<< "LV: Not considering vector loop of width " << VF
<< " because it would cause replicated blocks to be generated,"
<< " which isn't allowed when optimizing for size.\n");
continue;
}

if (isMoreProfitable(Candidate, ChosenFactor))
ChosenFactor = Candidate;
}
Expand Down Expand Up @@ -7548,6 +7563,14 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
<< " because it will not generate any vector instructions.\n");
continue;
}
if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
LLVM_DEBUG(
dbgs()
<< "LV: Not considering vector loop of width " << VF
<< " because it would cause replicated blocks to be generated,"
<< " which isn't allowed when optimizing for size.\n");
continue;
}

InstructionCost Cost = cost(*P, VF);
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; REQUIRES: asserts
; RUN: opt < %s -passes=loop-vectorize -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST
; RUN: opt < %s -passes=loop-vectorize,instcombine,simplifycfg -force-vector-width=2 -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s
; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST
; RUN: opt < %s -passes=loop-vectorize,instcombine,simplifycfg -force-vector-interleave=1 -force-vector-width=2 -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s

target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"

; This test checks that we correctly compute the scalarized operands for a
; user-specified vectorization factor when interleaving is disabled. We use the
; "optsize" attribute to disable all interleaving calculations. A cost of 4
; for %var4 indicates that we would scalarize its operand (%var3), giving
; user-specified vectorization factor when interleaving is disabled. We use
; -force-vector-interleave=1 to disable all interleaving calculations. A cost of
; 4 for %var4 indicates that we would scalarize its operand (%var3), giving
; %var4 a lower scalarization overhead.
;
; COST-LABEL: predicated_udiv_scalarized_operand
; COST: Cost of 5 for VF 2: profitable to scalarize %var4 = udiv i64 %var2, %var3
;
;
define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) optsize {
define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) {
; CHECK-LABEL: @predicated_udiv_scalarized_operand(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1475,55 +1475,29 @@ exit:
ret void
}

define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize {
define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) {
; DEFAULT-LABEL: define void @redundant_branch_and_tail_folding(
; DEFAULT-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) #[[ATTR4:[0-9]+]] {
; DEFAULT-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) {
; DEFAULT-NEXT: [[ENTRY:.*]]:
; DEFAULT-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; DEFAULT: [[VECTOR_PH]]:
; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
; DEFAULT: [[VECTOR_BODY]]:
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ]
; DEFAULT-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], splat (i64 20)
; DEFAULT-NEXT: [[TMP1:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 1)
; DEFAULT-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i32>
; DEFAULT-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
; DEFAULT-NEXT: br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; DEFAULT: [[PRED_STORE_IF]]:
; DEFAULT-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
; DEFAULT-NEXT: store i32 [[TMP4]], ptr [[DST]], align 4
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE]]
; DEFAULT: [[PRED_STORE_CONTINUE]]:
; DEFAULT-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
; DEFAULT-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
; DEFAULT: [[PRED_STORE_IF1]]:
; DEFAULT-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
; DEFAULT-NEXT: store i32 [[TMP6]], ptr [[DST]], align 4
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE2]]
; DEFAULT: [[PRED_STORE_CONTINUE2]]:
; DEFAULT-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
; DEFAULT: [[PRED_STORE_IF3]]:
; DEFAULT-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
; DEFAULT-NEXT: store i32 [[TMP8]], ptr [[DST]], align 4
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE4]]
; DEFAULT: [[PRED_STORE_CONTINUE4]]:
; DEFAULT-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
; DEFAULT-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]]
; DEFAULT: [[PRED_STORE_IF5]]:
; DEFAULT-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
; DEFAULT-NEXT: store i32 [[TMP10]], ptr [[DST]], align 4
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE6]]
; DEFAULT: [[PRED_STORE_CONTINUE6]]:
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; DEFAULT-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
; DEFAULT-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; DEFAULT-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; DEFAULT-NEXT: [[TMP0:%.*]] = add nuw nsw <4 x i64> [[STEP_ADD]], splat (i64 1)
; DEFAULT-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[TMP0]] to <4 x i32>
; DEFAULT-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
; DEFAULT-NEXT: store i32 [[TMP2]], ptr [[DST]], align 4
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
; DEFAULT-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; DEFAULT-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
; DEFAULT-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
; DEFAULT-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
; DEFAULT: [[SCALAR_PH]]:
; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 24, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; DEFAULT-NEXT: br label %[[LOOP_HEADER:.*]]
; DEFAULT: [[LOOP_HEADER]]:
; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
Expand All @@ -1540,7 +1514,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize {
; DEFAULT-NEXT: ret void
;
; PRED-LABEL: define void @redundant_branch_and_tail_folding(
; PRED-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) #[[ATTR4:[0-9]+]] {
; PRED-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) {
; PRED-NEXT: [[ENTRY:.*]]:
; PRED-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; PRED: [[VECTOR_PH]]:
Expand Down
Loading