Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 41 additions & 9 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -578,8 +578,10 @@ class InnerLoopVectorizer {
/// The profitability analysis.
LoopVectorizationCostModel *Cost;

/// BFI and PSI are used to check for profile guided size optimizations.
/// Used to calculate the probability of predicated blocks in
/// getPredBlockCostDivisor.
BlockFrequencyInfo *BFI;
/// Used to check for profile guided size optimizations.
ProfileSummaryInfo *PSI;

/// Structure to hold information about generated runtime checks, responsible
Expand Down Expand Up @@ -900,7 +902,7 @@ class LoopVectorizationCostModel {
InterleavedAccessInfo &IAI,
ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
: ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), BFI(BFI), TheFunction(F),
Hints(Hints), InterleaveInfo(IAI) {
if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
initializeVScaleForTuning();
Expand Down Expand Up @@ -1249,6 +1251,17 @@ class LoopVectorizationCostModel {
/// Superset of instructions that return true for isScalarWithPredication.
bool isPredicatedInst(Instruction *I) const;

/// A helper function that returns how much we should divide the cost of a
/// predicated block by. Typically this is the reciprocal of the block
/// probability, i.e. if we return X we are assuming the predicated block will
/// execute once for every X iterations of the loop header so the block should
/// only contribute 1/X of its cost to the total cost calculation, but when
/// optimizing for code size it will just be 1 as code size costs don't depend
/// on execution probabilities.
inline unsigned
getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
const BasicBlock *BB) const;

/// Return the costs for our two available strategies for lowering a
/// div/rem operation which requires speculating at least one lane.
/// First result is for scalarization (will be invalid for scalable
Expand Down Expand Up @@ -1711,6 +1724,8 @@ class LoopVectorizationCostModel {
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;

const BlockFrequencyInfo *BFI;

const Function *TheFunction;

/// Loop Vectorize Hint.
Expand Down Expand Up @@ -2866,6 +2881,19 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
}
}

/// Returns the divisor to apply to the cost of the predicated block \p BB.
///
/// For TTI::TCK_CodeSize the result is always 1. Otherwise it is the block
/// frequency of the loop header divided by the block frequency of \p BB, both
/// taken from BFI, clamped to [1, largest unsigned value] so that callers can
/// always divide by the result.
unsigned LoopVectorizationCostModel::getPredBlockCostDivisor(
    TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB) const {
  if (CostKind == TTI::TCK_CodeSize)
    return 1;

  assert(BFI && "BlockFrequencyInfo is required to scale predicated costs");
  const uint64_t HeaderFreq =
      BFI->getBlockFreq(TheLoop->getHeader()).getFrequency();
  const uint64_t BBFreq = BFI->getBlockFreq(BB).getFrequency();
  assert(HeaderFreq >= BBFreq &&
         "Header has smaller block freq than dominated BB?");

  // Reuse the frequencies computed above rather than querying BFI again.
  // A block frequency of 0 would make the division undefined, so treat it as
  // the smallest non-zero frequency instead.
  const uint64_t Ratio = HeaderFreq / (BBFreq != 0 ? BBFreq : 1);

  // Callers divide costs by the returned value: never hand back 0, and do not
  // let a very large ratio wrap around when narrowing to unsigned.
  const uint64_t MaxDivisor = ~0U;
  if (Ratio < 1)
    return 1;
  if (Ratio > MaxDivisor)
    return ~0U;
  return static_cast<unsigned>(Ratio);
}

std::pair<InstructionCost, InstructionCost>
LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
ElementCount VF) const {
Expand Down Expand Up @@ -2902,7 +2930,8 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
// Scale the cost by the probability of executing the predicated blocks.
// This assumes the predicated block for each vector lane is equally
// likely.
ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind);
ScalarizationCost =
ScalarizationCost / getPredBlockCostDivisor(CostKind, I->getParent());
}

InstructionCost SafeDivisorCost = 0;
Expand Down Expand Up @@ -5035,7 +5064,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
}

// Scale the total scalar cost by block probability.
ScalarCost /= getPredBlockCostDivisor(CostKind);
ScalarCost /= getPredBlockCostDivisor(CostKind, PredInst->getParent());

// Compute the discount. A non-negative discount means the vector version
// of the instruction costs more, and scalarizing would be beneficial.
Expand Down Expand Up @@ -5088,7 +5117,7 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
// cost by the probability of executing it. blockNeedsPredication from
// Legal is used so as to not include all blocks in tail folded loops.
if (VF.isScalar() && Legal->blockNeedsPredication(BB))
BlockCost /= getPredBlockCostDivisor(CostKind);
BlockCost /= getPredBlockCostDivisor(CostKind, BB);

Cost += BlockCost;
}
Expand Down Expand Up @@ -5167,7 +5196,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// conditional branches, but may not be executed for each vector lane. Scale
// the cost by the probability of executing the predicated block.
if (isPredicatedInst(I)) {
Cost /= getPredBlockCostDivisor(CostKind);
Cost /= getPredBlockCostDivisor(CostKind, I->getParent());

// Add the cost of an i1 extract and a branch
auto *VecI1Ty =
Expand Down Expand Up @@ -6727,6 +6756,11 @@ bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
SkipCostComputation.contains(UI);
}

/// Forwards to the cost model's getPredBlockCostDivisor for \p CostKind and
/// \p BB, so that code holding only a VPCostContext can query the divisor.
unsigned VPCostContext::getPredBlockCostDivisor(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this needed when the caller could just use Ctx.CM.getPredBlockCostDivisor?

TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB) const {
return CM.getPredBlockCostDivisor(CostKind, BB);
}

InstructionCost
LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
VPCostContext &CostCtx) const {
Expand Down Expand Up @@ -10310,9 +10344,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,

auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
BFI = nullptr;
if (PSI && PSI->hasProfileSummary())
BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
LoopVectorizeResult Result = runImpl(F);
if (!Result.MadeAnyChange)
return PreservedAnalyses::all();
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -855,7 +855,9 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
// For the scalar case, we may not always execute the original predicated
// block. Thus, scale the block's cost by the probability of executing it.
if (VF.isScalar())
return ThenCost / getPredBlockCostDivisor(Ctx.CostKind);
if (auto *VPIRBB = dyn_cast<VPIRBasicBlock>(Then))
return ThenCost / Ctx.getPredBlockCostDivisor(Ctx.CostKind,
VPIRBB->getIRBasicBlock());

return ThenCost;
}
Expand Down
18 changes: 3 additions & 15 deletions llvm/lib/Transforms/Vectorize/VPlanHelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,21 +50,6 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF);
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
int64_t Step);

/// Returns the divisor applied to the cost of a predicated block.
///
/// The divisor is normally the reciprocal of the block's execution
/// probability: a result of X means the block is assumed to run once per X
/// iterations of the loop header, so only 1/X of its cost is counted. Code
/// size does not depend on execution probability, so for TTI::TCK_CodeSize
/// the divisor is 1.
///
/// TODO: Use the actual block probability when it is available. For now every
/// predicated block is assumed to execute 50% of the time.
inline unsigned
getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind) {
  // Size costs are counted in full regardless of how often the block runs.
  if (CostKind == TTI::TCK_CodeSize)
    return 1;

  // Fixed 50% execution probability for all other cost kinds.
  return 2;
}

/// A range of powers-of-2 vectorization factors with fixed start and
/// adjustable end. The range includes start and excludes end, e.g.,:
/// [1, 16) = {1, 2, 4, 8}
Expand Down Expand Up @@ -378,6 +363,9 @@ struct VPCostContext {
InstructionCost getScalarizationOverhead(Type *ResultTy,
ArrayRef<const VPValue *> Operands,
ElementCount VF);

/// Returns how much the cost of the predicated block \p BB should be divided
/// by for \p CostKind. Forwards to the cost model's getPredBlockCostDivisor.
unsigned getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
const BasicBlock *BB) const;
};

/// This class can be used to assign names to VPValues. For VPValues without
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3170,7 +3170,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
// Scale the cost by the probability of executing the predicated blocks.
// This assumes the predicated block for each vector lane is equally
// likely.
ScalarCost /= getPredBlockCostDivisor(Ctx.CostKind);
ScalarCost /= Ctx.getPredBlockCostDivisor(Ctx.CostKind, UI->getParent());
return ScalarCost;
}
case Instruction::Load:
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/Other/new-pm-defaults.ll
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,8 @@
; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis on foo
; CHECK-O-NEXT: Running pass: InjectTLIMappings
; CHECK-O-NEXT: Running pass: LoopVectorizePass
; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis on foo
; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis on foo
; CHECK-O-NEXT: Running pass: InferAlignmentPass
; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
; CHECK-O-NEXT: Running pass: InstCombinePass
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/Other/new-pm-lto-defaults.ll
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@
; CHECK-O23SZ-NEXT: Running analysis: LoopAccessAnalysis on foo
; CHECK-O23SZ-NEXT: Running pass: LoopVectorizePass on foo
; CHECK-O23SZ-NEXT: Running analysis: DemandedBitsAnalysis on foo
; CHECK-O23SZ-NEXT: Running analysis: BlockFrequencyAnalysis on foo
; CHECK-O23SZ-NEXT: Running analysis: BranchProbabilityAnalysis on foo
; CHECK-O23SZ-NEXT: Running pass: InferAlignmentPass on foo
; CHECK-O23SZ-NEXT: Running pass: LoopUnrollPass on foo
; CHECK-O23SZ-NEXT: WarnMissedTransformationsPass on foo
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,8 @@
; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis on foo
; CHECK-POSTLINK-O-NEXT: Running pass: InjectTLIMappings
; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass
; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis on foo
; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis on foo
; CHECK-POSTLINK-O-NEXT: Running pass: InferAlignmentPass
; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP17:%.*]], [[PRED_UDIV_CONTINUE2]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], splat (i64 1)
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0
; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; CHECK: pred.udiv.if:
Expand Down Expand Up @@ -65,7 +65,7 @@ for.body:
%r = phi i64 [ 0, %entry ], [ %var6, %for.inc ]
%var0 = getelementptr inbounds i64, ptr %a, i64 %i
%var2 = load i64, ptr %var0, align 4
%cond0 = icmp sgt i64 %var2, 0
%cond0 = icmp sgt i64 %var2, 1
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This constant was changed to keep the old branch probability the same and keep the block scalarized: with BFI, the branch on `icmp sgt %x, 0` is predicted to be taken slightly more than 50% of the time.

br i1 %cond0, label %if.then, label %for.inc

if.then:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -612,63 +612,18 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
;
; COMMON-LABEL: define void @low_trip_count_fold_tail_scalarized_store(
; COMMON-SAME: ptr [[DST:%.*]]) {
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: br label %[[VECTOR_PH:.*]]
; COMMON: [[VECTOR_PH]]:
; COMMON-NEXT: br label %[[VECTOR_BODY:.*]]
; COMMON: [[VECTOR_BODY]]:
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; COMMON: [[PRED_STORE_IF]]:
; COMMON-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[DST]], i64 0
; COMMON-NEXT: store i8 0, ptr [[TMP0]], align 1
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE]]
; COMMON: [[PRED_STORE_CONTINUE]]:
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
; COMMON: [[PRED_STORE_IF1]]:
; COMMON-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[DST]], i64 1
; COMMON-NEXT: store i8 1, ptr [[TMP1]], align 1
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE2]]
; COMMON: [[PRED_STORE_CONTINUE2]]:
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
; COMMON: [[PRED_STORE_IF3]]:
; COMMON-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 2
; COMMON-NEXT: store i8 2, ptr [[TMP2]], align 1
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE4]]
; COMMON: [[PRED_STORE_CONTINUE4]]:
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
; COMMON: [[PRED_STORE_IF5]]:
; COMMON-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 3
; COMMON-NEXT: store i8 3, ptr [[TMP3]], align 1
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE6]]
; COMMON: [[PRED_STORE_CONTINUE6]]:
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
; COMMON: [[PRED_STORE_IF7]]:
; COMMON-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 4
; COMMON-NEXT: store i8 4, ptr [[TMP4]], align 1
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE8]]
; COMMON: [[PRED_STORE_CONTINUE8]]:
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
; COMMON: [[PRED_STORE_IF9]]:
; COMMON-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 5
; COMMON-NEXT: store i8 5, ptr [[TMP5]], align 1
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE10]]
; COMMON: [[PRED_STORE_CONTINUE10]]:
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
; COMMON: [[PRED_STORE_IF11]]:
; COMMON-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 6
; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]]
; COMMON: [[PRED_STORE_CONTINUE12]]:
; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]]
; COMMON: [[PRED_STORE_IF13]]:
; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7
; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1
; COMMON-NEXT: br label %[[EXIT]]
; COMMON: [[EXIT]]:
; COMMON-NEXT: br label %[[SCALAR_PH:.*]]
; COMMON: [[SCALAR_PH]]:
; COMMON-NEXT: br [[EXIT1:label %.*]]
; COMMON: [[SCALAR_PH1:.*:]]
; COMMON-NEXT: [[ENTRY:.*]]:
; COMMON-NEXT: br label %[[EXIT1:.*]]
; COMMON: [[EXIT1]]:
; COMMON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[EXIT1]] ]
; COMMON-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i8
; COMMON-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
; COMMON-NEXT: store i8 [[IV_TRUNC]], ptr [[GEP]], align 1
; COMMON-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; COMMON-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 7
; COMMON-NEXT: br i1 [[EC]], label %[[SCALAR_PH1:.*]], label %[[EXIT1]]
; COMMON: [[SCALAR_PH1]]:
; COMMON-NEXT: ret void
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Previously all the predicated scalar stores were discounted by x0.5 in computePredInstDiscount. BFI now correctly returns that this block is always executed so the VF=1 plan is no longer discounted.

;
entry:
br label %loop
Expand Down Expand Up @@ -1241,8 +1196,7 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 {
; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
; DEFAULT-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2]], i64 8)
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; DEFAULT: [[VECTOR_MEMCHECK]]:
; DEFAULT-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ define i64 @same_exit_block_pre_inc_use1_nosve() {
; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}> = first-active-lane ir<%cmp3>
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.*}}> = add
; CHECK-NEXT: Cost of 0 for VF 16: vp<{{.*}}> = DERIVED-IV
; CHECK: LV: Minimum required TC for runtime checks to be profitable:160
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 160)
; CHECK: LV: Minimum required TC for runtime checks to be profitable:128
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 128)
; CHECK-NEXT: LV: Too many memory checks needed.
entry:
%p1 = alloca [1024 x i8]
Expand Down Expand Up @@ -105,7 +105,7 @@ loop.header:
%gep.src = getelementptr inbounds i64, ptr %src, i64 %iv
%l = load i64, ptr %gep.src, align 1
%t = trunc i64 %l to i1
br i1 %t, label %exit.0, label %loop.latch
br i1 %t, label %exit.0, label %loop.latch, !prof !0
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added to retain the old branch probability. With BFI, the probability of exit.0 being taken is computed as 3% (why that is, I'm not sure) and the IV increment isn't discounted in the VF=1 plan.


loop.latch:
%iv.next = add i64 %iv, 1
Expand All @@ -120,4 +120,6 @@ exit.1:
ret i64 0
}

!0 = !{!"branch_weights", i32 1, i32 1}

attributes #1 = { "target-features"="+sve" vscale_range(1,16) }
Loading
Loading