diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 5d3b233ed6b6a..e63889c9fd2a1 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1852,6 +1852,12 @@ class TargetTransformInfo {
   /// maximum register pressure exceeds getNumberOfRegisters.
   LLVM_ABI bool shouldConsiderVectorizationRegPressure() const;
 
+  /// Return true if the loop vectorizer should generate control flow
+  /// (conditional blocks) inside the vector region. Otherwise, the loop
+  /// vectorizer generates a single block for the vector region and handles
+  /// control flow via a mask.
+  LLVM_ABI bool preferControlFlow() const;
+
   /// \returns True if the target wants to expand the given reduction intrinsic
   /// into a shuffle sequence.
   LLVM_ABI bool shouldExpandReduction(const IntrinsicInst *II) const;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 4cd607c0d0c8d..5dd418294dad0 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1099,6 +1099,8 @@ class TargetTransformInfoImplBase {
   virtual bool shouldConsiderVectorizationRegPressure() const { return false; }
 
+  virtual bool preferControlFlow() const { return false; }
+
   virtual bool shouldExpandReduction(const IntrinsicInst *II) const {
     return true;
   }
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 42ddb32d24093..f776dc64b89e7 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -793,6 +793,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return BaseT::preferPredicateOverEpilogue(TFI);
   }
 
+  bool preferControlFlow() const override { return BaseT::preferControlFlow(); }
+
   TailFoldingStyle
   getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) const override {
     return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index bf62623099a97..8590f667d7e89 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -372,6 +372,10 @@ bool TargetTransformInfo::preferPredicateOverEpilogue(
   return TTIImpl->preferPredicateOverEpilogue(TFI);
 }
 
+bool TargetTransformInfo::preferControlFlow() const {
+  return TTIImpl->preferControlFlow();
+}
+
 TailFoldingStyle TargetTransformInfo::getPreferredTailFoldingStyle(
     bool IVUpdateMayOverflow) const {
   return TTIImpl->getPreferredTailFoldingStyle(IVUpdateMayOverflow);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 6886e8964e29e..e4db87065bbd3 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -143,6 +143,8 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
   bool shouldConsiderVectorizationRegPressure() const override { return true; }
 
+  bool preferControlFlow() const override { return false; }
+
   InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                         Align Alignment, unsigned AddressSpace,
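Note on usage: the hook defaults to false everywhere, and RISC-V keeps that default above. As a minimal sketch of how a target would opt in (the class name MyTargetTTIImpl and the elided boilerplate are assumptions for illustration, not part of this patch):

  // Hypothetical target TTI that opts in to branchy vector code.
  class MyTargetTTIImpl final : public BasicTTIImplBase<MyTargetTTIImpl> {
    // ... usual TTI boilerplate (constructor, subtarget accessors) elided ...
  public:
    // Ask the loop vectorizer to keep conditional blocks in the vector region.
    bool preferControlFlow() const override { return true; }
  };

Targets that do not override the hook are unaffected and continue to get the flattened, fully masked vector region.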
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index febdc54e666a9..8c620bf14ae24 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -345,6 +345,10 @@ static cl::opt<bool> PreferPredicatedReductionSelect(
     cl::desc(
         "Prefer predicating a reduction operation over an after loop select."));
 
+static cl::opt<bool> PreferControlFlow(
+    "prefer-control-flow", cl::init(false), cl::Hidden,
+    cl::desc("Generate control flow inside the vector region."));
+
 cl::opt<bool> llvm::EnableVPlanNativePath(
     "enable-vplan-native-path", cl::Hidden,
     cl::desc("Enable VPlan-native vectorization path with "
@@ -4202,6 +4206,10 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
       case VPInstruction::ExplicitVectorLength:
         C += VPI->cost(VF, CostCtx);
         break;
+      case VPInstruction::AnyOf:
+        if (!VPI->getUnderlyingValue())
+          C += VPI->cost(VF, CostCtx);
+        break;
       default:
         break;
       }
@@ -8198,6 +8206,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
     if (CM.foldTailWithEVL())
       VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength, *Plan,
                               CM.getMaxSafeElements());
+    if (PreferControlFlow || TTI.preferControlFlow())
+      VPlanTransforms::optimizeConditionalVPBB(*Plan);
     assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
     VPlans.push_back(std::move(Plan));
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index fa1fdaf7b5ce0..8f9d7e73f0480 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -4486,3 +4486,160 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
     }
   }
 }
+
+void VPlanTransforms::optimizeConditionalVPBB(VPlan &Plan) {
+  VPDominatorTree VPDT(Plan);
+
+  VPValue *HeaderMask = findHeaderMask(Plan);
+
+  // Get the mask from the store recipes. Only stores whose mask is a real
+  // condition, rather than just the loop's header mask, are candidates.
+  auto GetMask = [&HeaderMask](VPRecipeBase &R) -> VPValue * {
+    using namespace llvm::VPlanPatternMatch;
+    if (isa<VPWidenStoreRecipe>(R)) {
+      VPValue *OrigMask = cast<VPWidenStoreRecipe>(R).getMask();
+      if (!OrigMask || OrigMask == HeaderMask ||
+          match(OrigMask, m_VPInstruction<VPInstruction::ActiveLaneMask>(
+                              m_VPValue(), m_VPValue())))
+        return nullptr;
+
+      return OrigMask;
+    }
+    return nullptr;
+  };
+
+  // First, collect all masked stores.
+  SmallVector<std::pair<VPRecipeBase *, VPValue *>> MaskedStores;
+  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+      Plan.getEntry());
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+    for (VPRecipeBase &R : *VPBB) {
+      if (VPValue *Mask = GetMask(R))
+        MaskedStores.emplace_back(&R, Mask);
+    }
+  }
+
+  if (MaskedStores.empty())
+    return;
+
+  DenseSet<VPRecipeBase *> Candidates;
+  auto AddOperandsToCandidates = [&Candidates](VPRecipeBase *R) {
+    for (VPValue *Op : R->operands())
+      if (VPRecipeBase *OpR = Op->getDefiningRecipe())
+        Candidates.insert(OpR);
+  };
+
+  SmallVector<SetVector<VPRecipeBase *>> Tries;
+  while (!MaskedStores.empty()) {
+    auto [SR, M] = MaskedStores.pop_back_val();
+    Candidates.clear();
+    AddOperandsToCandidates(SR);
+
+    SetVector<VPRecipeBase *> CurrentTree;
+    CurrentTree.insert(SR);
+
+    VPBasicBlock *MaskBlock =
+        M->hasDefiningRecipe() ? M->getDefiningRecipe()->getParent() : nullptr;
+
+    // Don't move recipes across the mask definition or the block's PHI
+    // recipes.
+    auto End = MaskBlock == SR->getParent()
+                   ? M->getDefiningRecipe()->getReverseIterator()
+                   : SR->getParent()->getFirstNonPhi()->getReverseIterator();
+    // Also don't move the recipes through any recipe that may have side
+    // effects or write to memory.
+    for (auto It = std::next(SR->getReverseIterator()); It != End; ++It) {
+      if (It->mayHaveSideEffects() || It->mayWriteToMemory()) {
+        End = It;
+        break;
+      }
+    }
+
+    // Greedily add all recipes that are used to compute the stored value to
+    // the tree. All users of an added recipe must dominate the store recipe.
+    for (VPRecipeBase &R : make_range(SR->getReverseIterator(), End)) {
+      // Recipe is not part of the tree.
+      if (!Candidates.contains(&R))
+        continue;
+
+      if (any_of(R.definedValues(), [&SR = SR, &VPDT](VPValue *Def) {
+            for (VPUser *U : Def->users()) {
+              if (auto *UR = dyn_cast<VPRecipeBase>(U)) {
+                if (UR == SR || VPDT.properlyDominates(UR, SR))
+                  continue;
+              }
+              return true;
+            }
+            return false;
+          }))
+        continue;
+
+      CurrentTree.insert(&R);
+      AddOperandsToCandidates(&R);
+    }
+
+    // The previous traversal could have added recipes that are used by
+    // non-added recipes; those need to be removed from the tree again.
+    SmallDenseSet<VPRecipeBase *> ToRemove;
+    bool Changed;
+    do {
+      Changed = false;
+      for (VPRecipeBase *R : CurrentTree) {
+        if (ToRemove.contains(R))
+          continue;
+        if (any_of(R->definedValues(), [&](VPValue *Def) {
+              for (VPUser *U : Def->users()) {
+                if (auto *UR = dyn_cast<VPRecipeBase>(U))
+                  if (!CurrentTree.contains(UR) || ToRemove.contains(UR))
+                    return true;
+              }
+              return false;
+            })) {
+          Changed = true;
+          ToRemove.insert(R);
+        }
+      }
+    } while (Changed);
+
+    for (VPRecipeBase *R : ToRemove)
+      CurrentTree.remove(R);
+
+    if (CurrentTree.size() > 1)
+      Tries.push_back(CurrentTree);
+  }
+
+  for (const auto &List : Tries) {
+    VPRecipeBase *SR = List.front();
+    VPValue *M = cast<VPWidenStoreRecipe>(SR)->getMask();
+    assert(M && "Mask VPValue must exist at this point");
+    auto Recipes = reverse(List.getArrayRef());
+
+    // Split the current basic block at the store recipe so that a predicated
+    // block can be added in between.
+    VPBasicBlock *ParentBB = SR->getParent();
+    VPBasicBlock *ContBB = ParentBB->splitAt(SR->getIterator());
+
+    // Create VPBB and insert it between ParentBB and ContBB.
+    VPBasicBlock *IfBB = Plan.createVPBasicBlock("vector.if.bb");
+    VPBlockUtils::insertBlockAfter(IfBB, ParentBB);
+    if (ContBB->getNumSuccessors() == 0)
+      ParentBB->getEnclosingLoopRegion()->setExiting(ContBB);
+
+    // Move the collected recipes into the conditional block.
+    for (VPRecipeBase *R : Recipes)
+      R->moveBefore(*IfBB, IfBB->end());
+
+    // Add the condition and branch in the parent block.
+    auto *ActiveLane =
+        new VPInstruction(VPInstruction::AnyOf, {M}, nullptr, "any.of.mask");
+    auto *BranchOnCond =
+        new VPInstruction(VPInstruction::BranchOnCond, ActiveLane);
+    ParentBB->appendRecipe(ActiveLane);
+    ParentBB->appendRecipe(BranchOnCond);
+
+    // Set proper predecessors and successors for the conditional block.
+    ParentBB->clearSuccessors();
+    ParentBB->setSuccessors({IfBB, ContBB});
+    ContBB->clearPredecessors();
+    ContBB->setPredecessors({ParentBB, IfBB});
+  }
+}
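To make the effect of the transform concrete, here is a simplified sketch of the vector code it produces. Names are shortened and some details (freezes, interleaving) are dropped; it mirrors the RISC-V test below rather than being literal output:

  ; Flattened form (without the transform): the masked store always executes.
  vector.body:
    %wide.load = load <4 x i64>, ptr %p, align 8
    %bits = and <4 x i64> %wide.load, %splat.cond
    %mask = icmp eq <4 x i64> %bits, %splat.cond
    %xor  = xor <4 x i64> %wide.load, %splat.bit
    call void @llvm.masked.store.v4i64.p0(<4 x i64> %xor, ptr %p, i32 8, <4 x i1> %mask)
    br label %vector.latch

  ; With optimizeConditionalVPBB: the masked operations are skipped entirely
  ; when no lane of %mask is active.
  vector.body:
    %wide.load = load <4 x i64>, ptr %p, align 8
    %bits = and <4 x i64> %wide.load, %splat.cond
    %mask = icmp eq <4 x i64> %bits, %splat.cond
    %any  = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %mask)
    br i1 %any, label %vector.if.bb, label %vector.body.split

  vector.if.bb:
    %xor = xor <4 x i64> %wide.load, %splat.bit
    call void @llvm.masked.store.v4i64.p0(<4 x i64> %xor, ptr %p, i32 8, <4 x i1> %mask)
    br label %vector.body.split

  vector.body.split:
    ; induction-variable update and latch branch (back edge or exit)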
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index b28559b620e13..56a75c1ddfc3d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -377,6 +377,29 @@ struct VPlanTransforms {
   /// users in the original exit block using the VPIRInstruction wrapping to the
   /// LCSSA phi.
   static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range);
+
+  /// Try to convert flattened control flow into a conditional vector basic
+  /// block, so that all masked operations are skipped when the mask has no
+  /// active lanes. The transformation collects the masked operations bottom-up
+  /// from the masked stores and moves them into a new vector basic block. The
+  /// original vector loop block is split and the newly created block is
+  /// inserted in between:
+  ///
+  ///   [ ] <-- vector.loop
+  ///   ^ |\        %any.of.mask = any-of(%Mask)
+  ///   | | \       BranchOnCond %any.of.mask
+  ///   | |  \
+  ///   |(F)  \ (T)
+  ///   | |    v
+  ///   | |   [ ] <-- vector.if.bb (masked operations)
+  ///   | |    |
+  ///   | |    v
+  ///   | +-->[ ] <-- vector.loop.split
+  ///   |      |
+  ///   +------+
+  ///          v
+  ///         [ ] <-- middle.block
+  static void optimizeConditionalVPBB(VPlan &Plan);
 };
 
 } // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
new file mode 100644
index 0000000000000..99f03723c567b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
@@ -0,0 +1,126 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -force-vector-width=4 -S -mtriple=riscv64 -mattr=+v -prefer-control-flow %s | FileCheck %s
+
+define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr %reg.24.val) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: i32 [[CONTROL1:%.*]], i32 [[CONTROL2:%.*]], i32 [[TARGET:%.*]], i32 [[REG_4_VAL:%.*]], ptr [[REG_24_VAL:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[REG_4_VAL]], 0
+; CHECK-NEXT:    br i1 [[CMP1]], label %[[FOR_BODY_LR_PH:.*]], label %[[FOR_END:.*]]
+; CHECK:       [[FOR_BODY_LR_PH]]:
+; CHECK-NEXT:    [[SH_PROM:%.*]] = zext nneg i32 [[CONTROL1]] to i64
+; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i64 1, [[SH_PROM]]
+; CHECK-NEXT:    [[SH_PROM5:%.*]] = zext nneg i32 [[CONTROL2]] to i64
+; CHECK-NEXT:    [[SHL6:%.*]] = shl nuw i64 1, [[SH_PROM5]]
+; CHECK-NEXT:    [[SH_PROM10:%.*]] = zext nneg i32 [[TARGET]] to i64
+; CHECK-NEXT:    [[SHL11:%.*]] = shl nuw nsw i64 1, [[SH_PROM10]]
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[REG_4_VAL]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = freeze i64 [[SHL6]]
+; CHECK-NEXT:    [[TMP1:%.*]] = or i64 [[SHL]], [[TMP0]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[SHL11]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY_SPLIT:.*]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[REG_24_VAL]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[TMP2]], i32 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP6]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP12:%.*]] = freeze <4 x i1> [[TMP7]]
+; CHECK-NEXT:    [[TMP13:%.*]] = freeze <4 x i1> [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]])
+; CHECK-NEXT:    br i1 [[TMP11]], label %[[VECTOR_IF_BB:.*]], label %[[VECTOR_BODY_SPLIT]]
+; CHECK:       [[VECTOR_IF_BB]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP9]], ptr [[TMP2]], i32 8, <4 x i1> [[TMP7]])
+; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP10]], ptr [[TMP4]], i32 8, <4 x i1> [[TMP8]])
+; CHECK-NEXT:    br label %[[VECTOR_BODY_SPLIT]]
+; CHECK:       [[VECTOR_BODY_SPLIT]]:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[REG_24_VAL]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP27:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[TMP28:%.*]] = and i64 [[TMP27]], [[TMP1]]
+; CHECK-NEXT:    [[OR_COND_NOT:%.*]] = icmp eq i64 [[TMP28]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[OR_COND_NOT]], label %[[IF_THEN9:.*]], label %[[FOR_INC]]
+; CHECK:       [[IF_THEN9]]:
+; CHECK-NEXT:    [[XOR:%.*]] = xor i64 [[TMP27]], [[SHL11]]
+; CHECK-NEXT:    store i64 [[XOR]], ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    br label %[[FOR_INC]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[FOR_END_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[FOR_END]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp1 = icmp sgt i32 %reg.4.val, 0
+  br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %sh_prom = zext nneg i32 %control1 to i64
+  %shl = shl nuw i64 1, %sh_prom
+  %sh_prom5 = zext nneg i32 %control2 to i64
+  %shl6 = shl nuw i64 1, %sh_prom5
+  %sh_prom10 = zext nneg i32 %target to i64
+  %shl11 = shl nuw nsw i64 1, %sh_prom10
+  %wide.trip.count = zext nneg i32 %reg.4.val to i64
+  %0 = freeze i64 %shl6
+  %1 = or i64 %shl, %0
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.inc ]
+  %arrayidx = getelementptr inbounds i64, ptr %reg.24.val, i64 %indvars.iv
+  %2 = load i64, ptr %arrayidx, align 8
+  %3 = and i64 %2, %1
+  %or.cond.not = icmp eq i64 %3, %1
+  br i1 %or.cond.not, label %if.then9, label %for.inc
+
+if.then9:
+  %xor = xor i64 %2, %shl11
+  store i64 %xor, ptr %arrayidx, align 8
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.