llvm · nasherm · Oct 13, 2025 · Oct 15, 2025 · Oct 15, 2025 · Oct 24, 2025
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7212,6 +7212,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
 
   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
   VPlanTransforms::simplifyRecipes(BestVPlan);
+  VPlanTransforms::removeRedundantAndMasks(BestVPlan);
   VPlanTransforms::removeBranchOnConst(BestVPlan);
   if (BestVPlan.getEntry()->getSingleSuccessor() ==
       BestVPlan.getScalarPreheader()) {

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -4406,3 +4406,146 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
     }
   }
 }
+
+/// Return true if \p I is an `and` of a PHI with a constant low-bit mask that
+/// the guard \p SCEVCheckConditional proves to be a no-op in the vector loop.
+///
+/// The guard must be an unsigned compare of some value X against a constant
+/// C, written `X ugt C` or `C ult X`. The mask is dropped only when C <= Mask.
+/// NOTE(review): this presumes the vector loop is entered only when the guard
+/// is false (so X <= C) and that X bounds the masked PHI. Neither is verified
+/// here; confirm both against how the check block is built.
+/// \p SCEVCheckBB is not inspected; it is kept so the signature is unchanged.
+static bool canElimAndMaskOnPHI(Instruction *I, VPIRBasicBlock *SCEVCheckBB,
+                                Value *SCEVCheckConditional) {
+  if (I->getOpcode() != Instruction::And)
+    return false;
+
+  // `and` is commutative, so accept the constant on either side.
+  Value *Op0 = I->getOperand(0);
+  Value *Op1 = I->getOperand(1);
+  auto *Mask = dyn_cast<ConstantInt>(Op0);
+  auto *IndVar = dyn_cast<PHINode>(Mask ? Op1 : Op0);
+  if (!Mask)
+    Mask = dyn_cast<ConstantInt>(Op1);
+
+  // Only a run of low set bits (2^k - 1) leaves every value <= Mask unchanged;
+  // other constants can clear bits of smaller values.
+  if (!Mask || !IndVar || !Mask->getValue().isMask())
+    return false;
+
+  auto *CmpI = dyn_cast_if_present<CmpInst>(SCEVCheckConditional);
+  if (!CmpI)
+    return false;
+
+  // Extract C from `X ugt C` or its mirrored spelling `C ult X`.
+  ConstantInt *Bound = nullptr;
+  if (CmpI->getPredicate() == CmpInst::ICMP_UGT)
+    Bound = dyn_cast<ConstantInt>(CmpI->getOperand(1));
+  else if (CmpI->getPredicate() == CmpInst::ICMP_ULT)
+    Bound = dyn_cast<ConstantInt>(CmpI->getOperand(0));
+
+  // X <= Bound makes the mask redundant only if Bound <= Mask; a looser bound
+  // admits values with bits above the mask. APInt::ule needs equal widths.
+  return Bound && Bound->getType() == Mask->getType() &&
+         Bound->getValue().ule(Mask->getValue());
+}
+
+/// Return true if \p Dest can be reached from \p Src by following one or more
+/// successor edges; a null \p Src or \p Dest yields false.
+/// NOTE(review): the recursion keeps no visited set, so it terminates only if
+/// the successor graph walked from \p Src is acyclic. Confirm at call sites.
+static bool CheckPathFromSrcBBToDestBB(VPBlockBase *Src, VPBlockBase *Dest) {
+  if (!Src || !Dest)
+    return false;
+
+  // A successor without successors simply fails the recursive call below.
+  for (VPBlockBase *Succ : Src->getSuccessors())
+    if (Succ == Dest || CheckPathFromSrcBBToDestBB(Succ, Dest))
+      return true;
+  return false;
+}
+
+/// Remove `and` masks on loop-header PHIs that the runtime SCEV check block
+/// already proves to be no-ops. For example, loop vectorization may create
+/// code like the following:
+///
+/// vector.scevcheck:
+///   %tc.m1 = add i64 %flatten.tripcount, -1
+///   %ovf = icmp ugt i64 %tc.m1, 4294967295
+///   br i1 %ovf, label %scalar.ph, label %vector.ph
+/// vector.ph:
+///   br label %vector.body
+/// vector.body:
+///   %iv = phi i64 [ 0, %vector.ph ], [ %iv.next, %vector.body ]
+///   %m = and i64 %iv, 4294967295 ; 0xffff_ffff, a no-op given the check
+///   %p = getelementptr inbounds i32, ptr %A, i64 %m
+///   %load = load <4 x i32>, ptr %p, align 4
+///   %sum = add <4 x i32> %load, %X
+///   store <4 x i32> %sum, ptr %p, align 4
+///   %iv.next = add nuw i64 %iv, 4
+///   %c = icmp ult i64 %iv.next, %N
+///   br i1 %c, label %vector.body, label %exit
+/// exit:
+///   ret void
+///
+/// The vectorizer creates the SCEV check block to perform runtime IV checks.
+/// Since the vector loop can only be entered for certain trip counts, that
+/// block can be used to bound the range of the IV. Every qualifying mask in
+/// the loop header has its uses forwarded to the PHI; the mask recipe itself
+/// is not erased here.
+void VPlanTransforms::removeRedundantAndMasks(VPlan &Plan) {
+  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+  if (!LoopRegion)
+    return;
+
+  // NOTE(review): the check block is located by its IR name, so nothing is
+  // simplified when that name is absent or different. Confirm this is wanted.
+  VPIRBasicBlock *SCEVCheckBB = nullptr;
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_deep(Plan.getEntry()))) {
+    auto *IRBB = dyn_cast<VPIRBasicBlock>(VPBB);
+    if (IRBB && IRBB->getIRBasicBlock()->getName() == "vector.scevcheck") {
+      SCEVCheckBB = IRBB;
+      break;
+    }
+  }
+  if (!SCEVCheckBB)
+    return;
+
+  // The check matters only if it can reach the vector preheader or the loop.
+  if (!CheckPathFromSrcBBToDestBB(SCEVCheckBB, Plan.getVectorPreheader()) &&
+      !CheckPathFromSrcBBToDestBB(SCEVCheckBB, LoopRegion))
+    return;
+
+  // Only a conditional-branch terminator carries the guard condition.
+  auto *Br = dyn_cast_if_present<VPInstruction>(SCEVCheckBB->getTerminator());
+  if (!Br || Br->getOpcode() != VPInstruction::BranchOnCond)
+    return;
+
+  // Only a live-in condition has an IR value that can be inspected.
+  VPValue *Cond = Br->getOperand(0);
+  if (!Cond->isLiveIn())
+    return;
+  Value *CondIR = Cond->getLiveInIRValue();
+
+  VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
+  for (VPRecipeBase &R : Header->getRecipeList()) {
+    auto *AndR = dyn_cast<VPSingleDefRecipe>(&R);
+    // Both operands are inspected below, so require at least two.
+    if (!AndR || AndR->getNumOperands() < 2)
+      continue;
+    auto *I = dyn_cast_if_present<Instruction>(AndR->getUnderlyingValue());
+    if (!I || !canElimAndMaskOnPHI(I, SCEVCheckBB, CondIR))
+      continue;
+
+    // Forward the mask's users to the header phi it masks, if there is one.
+    for (VPRecipeBase &PhiR : Header->phis()) {
+      auto *Phi = dyn_cast<VPSingleDefRecipe>(&PhiR);
+      if (Phi && (AndR->getOperand(0) == Phi || AndR->getOperand(1) == Phi)) {
+        AndR->replaceAllUsesWith(Phi);
+        break;
+      }
+    }
+  }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -376,6 +376,11 @@ struct VPlanTransforms {
   /// users in the original exit block using the VPIRInstruction wrapping to the
   /// LCSSA phi.
   static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range);
+
+  /// Use the runtime SCEV check block generated by the LoopVectorizer to
+  /// remove `and` masks on loop-header PHIs that the check proves to be
+  /// no-ops inside the vector loop.
+  static void removeRedundantAndMasks(VPlan &Plan);
 };
 
 } // namespace llvm

diff --git a/llvm/test/Transforms/LoopVectorize/vplan-transforms-remove-redundant-masks.ll b/llvm/test/Transforms/LoopVectorize/vplan-transforms-remove-redundant-masks.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes=loop-vectorize -force-vector-width=2 < %s | FileCheck %s
+
+define void @elim_no_op_and(i32 %N, ptr  %A, i32 %val) {
+; CHECK-LABEL: define void @elim_no_op_and(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]], i32 [[VAL:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP16_NOT:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP16_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_COND1_PREHEADER_LR_PH:.*]]
+; CHECK:       [[FOR_COND1_PREHEADER_LR_PH]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[FLATTEN_TRIPCOUNT:%.*]] = mul nuw i64 [[TMP0]], [[TMP0]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[FLATTEN_TRIPCOUNT]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK:       [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[FLATTEN_TRIPCOUNT]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[FLATTEN_TRIPCOUNT]], 2
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[FLATTEN_TRIPCOUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[VAL]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    store <2 x i32> [[TMP5]], ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP3]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[FLATTEN_TRIPCOUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_COND1_PREHEADER_LR_PH]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br label %[[FOR_COND1_PREHEADER:.*]]
+; CHECK:       [[FOR_COND1_PREHEADER]]:
+; CHECK-NEXT:    [[INDVAR18:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVAR_NEXT19:%.*]], %[[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = and i64 [[INDVAR18]], 4294967295
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD5:%.*]] = add i32 [[TMP7]], [[VAL]]
+; CHECK-NEXT:    store i32 [[ADD5]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INDVAR_NEXT19]] = add nuw i64 [[INDVAR18]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVAR_NEXT19]], [[FLATTEN_TRIPCOUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[FOR_COND_CLEANUP]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp16.not = icmp eq i32 %N, 0
+  br i1 %cmp16.not, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph
+; The trip count below is zext(N) * zext(N) in i64, so it can exceed 32 bits.
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  %0 = zext i32 %N to i64
+  %flatten.tripcount = mul nuw i64 %0, %0
+  br label %for.cond1.preheader
+; %idxprom masks the i64 IV to its low 32 bits; the checks above expect the vector body to index with the IV directly.
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.lr.ph, %for.cond1.preheader
+  %indvar18 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvar.next19, %for.cond1.preheader ]
+  %idxprom = and i64 %indvar18, 4294967295
+  %arrayidx = getelementptr inbounds nuw i32, ptr %A, i64 %idxprom
+  %1 = load i32, ptr %arrayidx, align 4
+  %add5 = add i32 %1, %val
+  store i32 %add5, ptr %arrayidx, align 4
+  %indvar.next19 = add nuw i64 %indvar18, 1
+  %exitcond.not = icmp eq i64 %indvar.next19, %flatten.tripcount
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.cond1.preheader
+
+for.cond.cleanup.loopexit:                        ; preds = %for.cond1.preheader
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+}
+
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+;.