diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e9ace195684b3..87c1370a7aad3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7992,9 +7992,9 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
   });
 }
 
-VPWidenMemoryRecipe *
-VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
-                                  VFRange &Range) {
+VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
+                                                       VFRange &Range) {
+  Instruction *I = VPI->getUnderlyingInstr();
   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
          "Must be called with either a load or store");
 
@@ -8016,7 +8016,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
 
   VPValue *Mask = nullptr;
   if (Legal->isMaskRequired(I))
-    Mask = getBlockInMask(Builder.getInsertBlock());
+    Mask = VPI->getMask();
 
   // Determine if the pointer operand of the access is either consecutive or
   // reverse consecutive.
@@ -8026,7 +8026,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
   bool Consecutive =
       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
 
-  VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
+  VPValue *Ptr = isa<LoadInst>(I) ? VPI->getOperand(0) : VPI->getOperand(1);
   if (Consecutive) {
     auto *GEP = dyn_cast<GetElementPtrInst>(
         Ptr->getUnderlyingValue()->stripPointerCasts());
@@ -8055,9 +8055,9 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
                                  VPIRMetadata(*Load, LVer), I->getDebugLoc());
 
   StoreInst *Store = cast<StoreInst>(I);
-  return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
-                                Reverse, VPIRMetadata(*Store, LVer),
-                                I->getDebugLoc());
+  return new VPWidenStoreRecipe(*Store, Ptr, VPI->getOperand(0), Mask,
+                                Consecutive, Reverse,
+                                VPIRMetadata(*Store, LVer), I->getDebugLoc());
 }
 
 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
@@ -8136,9 +8136,9 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
   return nullptr;
 }
 
-VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
-                                                   ArrayRef<VPValue *> Operands,
+VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI,
                                                    VFRange &Range) {
+  CallInst *CI = cast<CallInst>(VPI->getUnderlyingInstr());
   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
       [this, CI](ElementCount VF) {
         return CM.isScalarWithPredication(CI, VF);
@@ -8155,7 +8155,8 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
       ID == Intrinsic::experimental_noalias_scope_decl))
     return nullptr;
 
-  SmallVector<VPValue *> Ops(Operands.take_front(CI->arg_size()));
+  SmallVector<VPValue *> Operands(VPI->operands());
+  SmallVector<VPValue *> Ops(ArrayRef(Operands).take_front(CI->arg_size()));
 
   // Is it beneficial to perform intrinsic call compared to lib call?
   bool ShouldUseVectorIntrinsic =
@@ -8201,6 +8202,9 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
       },
       Range);
   if (ShouldUseVectorCall) {
+    VPValue *Mask = nullptr;
+    if (VPI->isMasked())
+      Mask = Operands.pop_back_val();
     if (MaskPos.has_value()) {
       // We have 2 cases that would require a mask:
       //   1) The block needs to be predicated, either due to a conditional
@@ -8209,10 +8213,7 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
       //   2) No mask is required for the block, but the only available
       //      vector variant at this VF requires a mask, so we synthesize an
      //      all-true mask.
-      VPValue *Mask = nullptr;
-      if (Legal->isMaskRequired(CI))
-        Mask = getBlockInMask(Builder.getInsertBlock());
-      else
+      if (!Legal->isMaskRequired(CI))
         Mask = Plan.getOrAddLiveIn(
             ConstantInt::getTrue(IntegerType::getInt1Ty(CI->getContext())));
 
@@ -8240,20 +8241,22 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
       Range);
 }
 
-VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
-                                           ArrayRef<VPValue *> Operands) {
-  switch (I->getOpcode()) {
+VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPInstruction *VPI) {
+  ArrayRef<VPValue *> Operands(VPI->operands());
+  switch (VPI->getOpcode()) {
   default:
     return nullptr;
   case Instruction::SDiv:
   case Instruction::UDiv:
   case Instruction::SRem:
   case Instruction::URem: {
+    VPValue *Mask = Operands.back();
+    if (VPI->isMasked())
+      Operands = Operands.drop_back();
    // If not provably safe, use a select to form a safe divisor before widening the
    // div/rem operation itself.  Otherwise fall through to general handling below.
     if (CM.isPredicatedInst(I)) {
       SmallVector<VPValue *> Ops(Operands);
-      VPValue *Mask = getBlockInMask(Builder.getInsertBlock());
       VPValue *One =
           Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
       auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
@@ -8318,9 +8321,8 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
   };
 }
 
-VPHistogramRecipe *
-VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
-                                     ArrayRef<VPValue *> Operands) {
+VPHistogramRecipe *VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
+                                                        VPInstruction *VPI) {
   // FIXME: Support other operations.
   unsigned Opcode = HI->Update->getOpcode();
   assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
@@ -8328,14 +8330,14 @@ VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
   SmallVector<VPValue *> HGramOps;
   // Bucket address.
-  HGramOps.push_back(Operands[1]);
+  HGramOps.push_back(VPI->getOperand(1));
   // Increment value.
   HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
 
   // In case of predicated execution (due to tail-folding, or conditional
   // execution, or both), pass the relevant mask.
   if (Legal->isMaskRequired(HI->Store))
-    HGramOps.push_back(getBlockInMask(Builder.getInsertBlock()));
+    HGramOps.push_back(VPI->getMask());
 
   return new VPHistogramRecipe(Opcode, HGramOps, HI->Store->getDebugLoc());
 }
 
@@ -8567,6 +8569,10 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
     return PhiRecipe;
   }
 
+  auto *VPI = cast<VPInstruction>(R);
+  if (VPI->isMasked())
+    Operands.pop_back();
+
   if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
                                     cast<TruncInst>(Instr), Operands, Range)))
     return Recipe;
@@ -8576,18 +8582,19 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
           [&](ElementCount VF) { return VF.isScalar(); }, Range))
     return nullptr;
 
-  if (auto *CI = dyn_cast<CallInst>(Instr))
-    return tryToWidenCall(CI, Operands, Range);
+  if (VPI->getOpcode() == Instruction::Call)
+    return tryToWidenCall(VPI, Range);
 
   if (StoreInst *SI = dyn_cast<StoreInst>(Instr))
     if (auto HistInfo = Legal->getHistogramInfo(SI))
-      return tryToWidenHistogram(*HistInfo, Operands);
+      return tryToWidenHistogram(*HistInfo, VPI);
 
-  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
-    return tryToWidenMemory(Instr, Operands, Range);
+  if (VPI->getOpcode() == Instruction::Load ||
+      VPI->getOpcode() == Instruction::Store)
+    return tryToWidenMemory(VPI, Range);
 
   if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
-    return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
+    return tryToCreatePartialReduction(VPI, ScaleFactor.value());
 
   if (!shouldWiden(Instr, Range))
     return nullptr;
@@ -8600,51 +8607,48 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
   }
 
   if (auto *CI = dyn_cast<CastInst>(Instr)) {
-    return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
-                                 *CI);
+    return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(0),
+                                 CI->getType(), *CI);
   }
 
-  return tryToWiden(Instr, Operands);
+  return tryToWiden(Instr, VPI);
 }
 
 VPRecipeBase *
-VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
-                                             ArrayRef<VPValue *> Operands,
+VPRecipeBuilder::tryToCreatePartialReduction(VPInstruction *Reduction,
                                              unsigned ScaleFactor) {
-  assert(Operands.size() == 2 &&
-         "Unexpected number of operands for partial reduction");
-
-  VPValue *BinOp = Operands[0];
-  VPValue *Accumulator = Operands[1];
+  VPValue *BinOp = Reduction->getOperand(0);
+  VPValue *Accumulator = Reduction->getOperand(1);
   VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
   if (isa<VPWidenCastRecipe>(BinOpRecipe) ||
       isa<VPPartialReductionRecipe>(BinOpRecipe))
     std::swap(BinOp, Accumulator);
 
   unsigned ReductionOpcode = Reduction->getOpcode();
+  auto *ReductionI = Reduction->getUnderlyingInstr();
   if (ReductionOpcode == Instruction::Sub) {
-    auto *const Zero = ConstantInt::get(Reduction->getType(), 0);
+    auto *const Zero = ConstantInt::get(ReductionI->getType(), 0);
     SmallVector<VPValue *> Ops;
     Ops.push_back(Plan.getOrAddLiveIn(Zero));
     Ops.push_back(BinOp);
-    BinOp = new VPWidenRecipe(*Reduction, Ops);
+    BinOp = new VPWidenRecipe(*ReductionI, Ops);
     Builder.insert(BinOp->getDefiningRecipe());
     ReductionOpcode = Instruction::Add;
   }
 
   VPValue *Cond = nullptr;
-  if (CM.blockNeedsPredicationForAnyReason(Reduction->getParent())) {
+  if (Reduction->isMasked()) {
     assert((ReductionOpcode == Instruction::Add ||
             ReductionOpcode == Instruction::Sub) &&
           "Expected an ADD or SUB operation for predicated partial "
           "reductions (because the neutral element in the mask is zero)!");
-    Cond = getBlockInMask(Builder.getInsertBlock());
+    Cond = Reduction->getMask();
     VPValue *Zero =
-        Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0));
+        Plan.getOrAddLiveIn(ConstantInt::get(ReductionI->getType(), 0));
     BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc());
   }
   return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
-                                      ScaleFactor, Reduction);
+                                      ScaleFactor, ReductionI);
 }
 
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -9067,8 +9071,11 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
             Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
           // Only create recipe for the final invariant store of the reduction.
           if (Legal->isInvariantStoreOfReduction(SI)) {
+            auto Ops = R.operands();
+            if (cast<VPInstruction>(R).isMasked())
+              Ops = drop_end(Ops);
             auto *Recipe =
-                new VPReplicateRecipe(SI, R.operands(), true /* IsUniform */,
+                new VPReplicateRecipe(SI, Ops, true /* IsUniform */,
                                       nullptr /*Mask*/, VPIRMetadata(*SI, LVer));
             Recipe->insertBefore(*MiddleVPBB, MBIP);
           }
@@ -9080,6 +9087,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
           RecipeBuilder.tryToCreateWidenRecipe(SingleDef, Range);
       if (!Recipe) {
         SmallVector<VPValue *> Operands(R.operands());
+        if (cast<VPInstruction>(R).isMasked()) {
+          Operands.pop_back();
+        }
         Recipe = RecipeBuilder.handleReplication(Instr, Operands, Range);
       }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 8369c78a2d78f..293d2991990dd 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -92,12 +92,10 @@ class VPRecipeBuilder {
   /// Range. The function should not be called for memory instructions or calls.
   bool shouldWiden(Instruction *I, VFRange &Range) const;
 
-  /// Check if the load or store instruction \p I should widened for \p
+  /// Check if the load or store instruction \p VPI should be widened for \p
   /// Range.Start and potentially masked. Such instructions are handled by a
   /// recipe that takes an additional VPInstruction for the mask.
-  VPWidenMemoryRecipe *tryToWidenMemory(Instruction *I,
-                                        ArrayRef<VPValue *> Operands,
-                                        VFRange &Range);
+  VPWidenMemoryRecipe *tryToWidenMemory(VPInstruction *VPI, VFRange &Range);
 
   /// Check if an induction recipe should be constructed for \p Phi. If so build
   /// and return it. If not, return null.
@@ -114,20 +112,19 @@ class VPRecipeBuilder {
   /// Handle call instructions. If \p CI can be widened for \p Range.Start,
   /// return a new VPWidenCallRecipe or VPWidenIntrinsicRecipe. Range.End may be
   /// decreased to ensure same decision from \p Range.Start to \p Range.End.
-  VPSingleDefRecipe *tryToWidenCall(CallInst *CI, ArrayRef<VPValue *> Operands,
-                                    VFRange &Range);
+  VPSingleDefRecipe *tryToWidenCall(VPInstruction *VPI, VFRange &Range);
 
   /// Check if \p I has an opcode that can be widened and return a VPWidenRecipe
   /// if it can. The function should only be called if the cost-model indicates
   /// that widening should be performed.
-  VPWidenRecipe *tryToWiden(Instruction *I, ArrayRef<VPValue *> Operands);
+  VPWidenRecipe *tryToWiden(Instruction *I, VPInstruction *VPI);
 
   /// Makes Histogram count operations safe for vectorization, by emitting a
   /// llvm.experimental.vector.histogram.add intrinsic in place of the
   /// Load + Add|Sub + Store operations that perform the histogram in the
   /// original scalar loop.
   VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI,
-                                         ArrayRef<VPValue *> Operands);
+                                         VPInstruction *VPI);
 
   /// Examines reduction operations to see if the target can use a cheaper
   /// operation with a wider per-iteration input VF and narrower PHI VF.
@@ -170,8 +167,7 @@ class VPRecipeBuilder {
 
   /// Create and return a partial reduction recipe for a reduction instruction
   /// along with binary operation and reduction phi operands.
-  VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction,
-                                            ArrayRef<VPValue *> Operands,
+  VPRecipeBase *tryToCreatePartialReduction(VPInstruction *Reduction,
                                             unsigned ScaleFactor);
 
   /// Set the recipe created for given ingredient.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 44f0b6d964a6e..9d610cbdf4f82 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -967,6 +967,13 @@ class VPInstruction : public VPRecipeWithIRFlags,
   /// value for lane \p Lane.
   Value *generatePerLane(VPTransformState &State, const VPLane &Lane);
 
+#if !defined(NDEBUG)
+  /// Return the number of operands determined by the opcode of the
+  /// VPInstruction. Returns -1 if the number of operands cannot be determined
+  /// directly by the opcode.
+  unsigned getNumOperandsForOpcode() const;
+#endif
+
 public:
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                 DebugLoc DL = {}, const Twine &Name = "")
@@ -1029,6 +1036,41 @@ class VPInstruction : public VPRecipeWithIRFlags,
     }
   }
 
+  bool isMasked() const {
+    return getNumOperandsForOpcode() + 1 == getNumOperands();
+  }
+
+  bool needsMask() const {
+    if (getNumOperandsForOpcode() == -1u)
+      return false;
+    if (Opcode == VPInstruction::BranchOnCond ||
+        Opcode == VPInstruction::BranchOnCount ||
+        Opcode == VPInstruction::Not || Opcode == Instruction::ExtractValue ||
+        Opcode == Instruction::FNeg)
+      return false;
+
+    switch (Opcode) {
+    case Instruction::SDiv:
+    case Instruction::SRem:
+    case Instruction::UDiv:
+    case Instruction::URem:
+      return true;
+    default:
+      return mayReadFromMemory() || mayWriteToMemory() || mayHaveSideEffects();
+    }
+  }
+
+  void addMask(VPValue *Mask) {
+    if (!needsMask())
+      return;
+    assert(!isMasked() && "recipe is already masked");
+    addOperand(Mask);
+  }
+
+  VPValue *getMask() const {
+    return isMasked() ? getOperand(getNumOperands() - 1) : nullptr;
+  }
+
   /// Returns true if the underlying opcode may read from or write to memory.
   bool opcodeMayReadOrWriteFromMemory() const;
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 926490bfad7d0..62314722253e9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -56,7 +56,9 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
   // other operands match and cache them.
   auto SetResultTyFromOp = [this, R]() {
     Type *ResTy = inferScalarType(R->getOperand(0));
-    for (unsigned Op = 1; Op != R->getNumOperands(); ++Op) {
+    unsigned NumOperands =
+        R->isMasked() ? R->getNumOperands() - 1 : R->getNumOperands();
+    for (unsigned Op = 1; Op != NumOperands; ++Op) {
       VPValue *OtherV = R->getOperand(Op);
       assert(inferScalarType(OtherV) == ResTy &&
              "different types inferred for different operands");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index f0cab79197b4d..b678b0df33226 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -72,7 +72,7 @@ class VPPredicator {
   }
 
   /// Compute and return the mask for the vector loop header block.
-  void createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail);
+  VPValue *createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail);
 
   /// Compute and return the predicate of \p VPBB, assuming that the header
   /// block of the loop is set to True, or to the loop mask when tail folding.
@@ -154,10 +154,11 @@ VPValue *VPPredicator::createBlockInMask(VPBasicBlock *VPBB) {
   return BlockMask;
 }
 
-void VPPredicator::createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail) {
+VPValue *VPPredicator::createHeaderMask(VPBasicBlock *HeaderVPBB,
+                                        bool FoldTail) {
   if (!FoldTail) {
     setBlockInMask(HeaderVPBB, nullptr);
-    return;
+    return nullptr;
   }
 
   // Introduce the early-exit compare IV <= BTC to form header block mask.
@@ -173,6 +174,7 @@ void VPPredicator::createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail) {
   VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
   VPValue *BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
   setBlockInMask(HeaderVPBB, BlockMask);
+  return BlockMask;
 }
 
 void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) {
@@ -272,16 +274,27 @@ VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
   for (VPBlockBase *VPB : RPOT) {
     // Non-outer regions with VPBBs only are supported at the moment.
     auto *VPBB = cast<VPBasicBlock>(VPB);
+    auto FirstNonPhi = VPBB->getFirstNonPhi();
     // Introduce the mask for VPBB, which may introduce needed edge masks, and
     // convert all phi recipes of VPBB to blend recipes unless VPBB is the
     // header.
+    VPValue *BlockInMask = nullptr;
     if (VPBB == Header) {
-      Predicator.createHeaderMask(Header, FoldTail);
-      continue;
+      BlockInMask = Predicator.createHeaderMask(Header, FoldTail);
+    } else {
+      BlockInMask = Predicator.createBlockInMask(VPBB);
+      Predicator.convertPhisToBlends(VPBB);
     }
 
-    Predicator.createBlockInMask(VPBB);
-    Predicator.convertPhisToBlends(VPBB);
+    if (!BlockInMask)
+      continue;
+
+    for (VPRecipeBase &R : make_range(FirstNonPhi, VPBB->end())) {
+      if (isa(&R))
+        continue;
+      auto *VPI = cast<VPInstruction>(&R);
+      VPI->addMask(BlockInMask);
+    }
   }
 
   // Linearize the blocks of the loop into one serial chain.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index a4831ea7c11f7..bfcce789a9282 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -413,8 +413,70 @@ VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
       Opcode(Opcode), Name(Name.str()) {
   assert(flagsValidForOpcode(getOpcode()) &&
          "Set flags not supported for the provided opcode");
+  assert((getNumOperandsForOpcode() == -1u ||
+          getNumOperandsForOpcode() == getNumOperands()) &&
+         "number of operands does not match opcode");
 }
 
+#ifndef NDEBUG
+unsigned VPInstruction::getNumOperandsForOpcode() const {
+  if (Instruction::isUnaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
+    return 1;
+
+  if (Instruction::isBinaryOp(getOpcode()))
+    return 2;
+
+  switch (getOpcode()) {
+  case VPInstruction::StepVector:
+    return 0;
+  case Instruction::Alloca:
+  case Instruction::ExtractValue:
+  case Instruction::Freeze:
+  case Instruction::Load:
+  case VPInstruction::AnyOf:
+  case VPInstruction::BranchOnCond:
+  case VPInstruction::CalculateTripCountMinusVF:
+  case VPInstruction::CanonicalIVIncrementForPart:
+  case VPInstruction::ExplicitVectorLength:
+  case VPInstruction::ExtractLastElement:
+  case VPInstruction::ExtractPenultimateElement:
+  case VPInstruction::FirstActiveLane:
+  case VPInstruction::Not:
+    return 1;
+
+  case Instruction::ICmp:
+  case Instruction::FCmp:
+  case Instruction::Store:
+  case VPInstruction::ActiveLaneMask:
+  case VPInstruction::BranchOnCount:
+  case VPInstruction::ComputeReductionResult:
+  case VPInstruction::FirstOrderRecurrenceSplice:
+  case VPInstruction::LogicalAnd:
+  case VPInstruction::WideIVStep:
+  case VPInstruction::PtrAdd:
+    return 2;
+  case Instruction::Select:
+  case VPInstruction::ComputeFindLastIVResult:
+    return 3;
+  case Instruction::Call: {
+    VPValue *LastOp = getOperand(getNumOperands() - 1);
+    if (LastOp->isLiveIn() && isa<Function>(LastOp->getLiveInIRValue()))
+      return getNumOperands();
+    assert(
+        isa<Function>(getOperand(getNumOperands() - 2)->getLiveInIRValue()) &&
+        "Called function must either be the last or second-to-last operand");
+    return getNumOperands() - 1;
+  }
+  case Instruction::PHI:
+  case Instruction::GetElementPtr:
+  case Instruction::Switch:
+    // Cannot determine the number of operands from the opcode.
+    return -1u;
+  }
+  llvm_unreachable("all cases should be handled above");
+}
+#endif
+
 bool VPInstruction::doesGeneratePerAllLanes() const {
   return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
 }
@@ -2706,7 +2768,10 @@ static void scalarizeInstruction(const Instruction *Instr,
 
   // Replace the operands of the cloned instructions with their scalar
   // equivalents in the new loop.
-  for (const auto &I : enumerate(RepRecipe->operands())) {
+  auto OpRange = RepRecipe->operands();
+  if (isa(Cloned))
+    OpRange = drop_end(OpRange);
+  for (const auto &I : enumerate(OpRange)) {
     auto InputLane = Lane;
     VPValue *Operand = I.value();
     if (vputils::isSingleScalar(Operand))
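Editorial note, not part of the patch: the VPlan.h hunk above establishes a simple convention, namely that a mask, when present, is exactly one operand beyond the arity implied by the opcode and is appended last. The following is a minimal, self-contained C++ sketch of that convention using hypothetical toy types (ToyInst, ToyValue); it only mirrors the invariant behind isMasked()/addMask()/getMask() and is not VPlan code.

// Illustrative sketch only: toy stand-ins, not VPlan classes.
#include <cassert>
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

struct ToyValue {
  std::string Name;
};

class ToyInst {
  unsigned Opcode;
  std::vector<ToyValue *> Operands;

public:
  enum : unsigned { Add, Load, Store };

  ToyInst(unsigned Opcode, std::vector<ToyValue *> Ops)
      : Opcode(Opcode), Operands(std::move(Ops)) {}

  // Fixed operand count implied by the opcode (mirrors the idea behind
  // getNumOperandsForOpcode in the patch).
  unsigned numOperandsForOpcode() const {
    switch (Opcode) {
    case Add:
    case Store:
      return 2;
    case Load:
      return 1;
    default:
      return -1u; // unknown arity
    }
  }

  // Masked iff there is exactly one operand beyond the fixed arity.
  bool isMasked() const {
    return numOperandsForOpcode() + 1 == Operands.size();
  }

  // In this toy model only memory opcodes accept a mask.
  bool needsMask() const { return Opcode == Load || Opcode == Store; }

  void addMask(ToyValue *Mask) {
    if (!needsMask())
      return;
    assert(!isMasked() && "already masked");
    Operands.push_back(Mask);
  }

  ToyValue *getMask() const { return isMasked() ? Operands.back() : nullptr; }
};

int main() {
  ToyValue Ptr{"ptr"}, BlockMask{"block.mask"};

  ToyInst Ld(ToyInst::Load, {&Ptr});
  Ld.addMask(&BlockMask); // appended as the trailing operand
  std::printf("load mask: %s\n", Ld.getMask() ? Ld.getMask()->Name.c_str() : "none");

  ToyInst Sum(ToyInst::Add, {&Ptr, &Ptr});
  Sum.addMask(&BlockMask); // ignored: Add needs no mask in this model
  std::printf("add masked: %d\n", (int)Sum.isMasked());
  return 0;
}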