llvm · fhahn · Nov 25, 2025
@@ -3492,17 +3492,27 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS,
           /// {X,+,N}/C => {Y,+,N}/C where Y=X-(X%N). Safe when C%N=0.
           // We can currently only fold X%N if X is constant.
           const SCEVConstant *StartC = dyn_cast<SCEVConstant>(AR->getStart());
-          if (StartC && !DivInt.urem(StepInt) &&
-              getZeroExtendExpr(AR, ExtTy) ==
-              getAddRecExpr(getZeroExtendExpr(AR->getStart(), ExtTy),
-                            getZeroExtendExpr(Step, ExtTy),
-                            AR->getLoop(), SCEV::FlagAnyWrap)) {
+          if (StartC && !DivInt.urem(StepInt)) {
             const APInt &StartInt = StartC->getAPInt();
             const APInt &StartRem = StartInt.urem(StepInt);
-            if (StartRem != 0) {
-              const SCEV *NewLHS =
-                  getAddRecExpr(getConstant(StartInt - StartRem), Step,
-                                AR->getLoop(), SCEV::FlagNW);
+            bool NoWrap =
+                getZeroExtendExpr(AR, ExtTy) ==
+                getAddRecExpr(getZeroExtendExpr(AR->getStart(), ExtTy),
+                              getZeroExtendExpr(Step, ExtTy), AR->getLoop(),
+                              SCEV::FlagAnyWrap);
+
+            // With X < N <= C and both N, C as powers-of-2, the transformation
+            // {X,+,N}/C => {(X - X%N),+,N}/C preserves division results even
+            // if wrapping occurs, as the division results remain equivalent for
+            // all offsets in [[(X - X%N), X).
+            bool CanFoldWithWrap = StepInt.isStrictlyPositive() &&
+                                   StartInt.ult(StepInt) && // X < N
+                                   StepInt.sle(DivInt) &&   // N <= C
+                                   StepInt.isPowerOf2() && DivInt.isPowerOf2();
+            if (StartRem != 0 && (NoWrap || CanFoldWithWrap)) {
+              const SCEV *NewLHS = getAddRecExpr(
+                  getConstant(StartInt - StartRem), Step, AR->getLoop(),
+                  NoWrap ? SCEV::FlagNW : SCEV::FlagAnyWrap);
               if (LHS != NewLHS) {
                 LHS = NewLHS;
 

diff --git a/llvm/test/Analysis/ScalarEvolution/addrec-may-wrap-udiv-canonicalize.ll b/llvm/test/Analysis/ScalarEvolution/addrec-may-wrap-udiv-canonicalize.ll
@@ -13,7 +13,7 @@ define void @test_step2_div4(i64 %n) {
 ; CHECK-NEXT:    %iv.1 = add i64 %iv, 1
 ; CHECK-NEXT:    --> {1,+,2}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %div.1 = udiv i64 %iv.1, 4
-; CHECK-NEXT:    --> ({1,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> ({0,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %iv.2 = add i64 %iv, 2
 ; CHECK-NEXT:    --> {2,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %div.2 = udiv i64 %iv.2, 4
@@ -114,15 +114,15 @@ define void @test_step4_div4(i64 %n) {
 ; CHECK-NEXT:    %iv.1 = add i64 %iv, 1
 ; CHECK-NEXT:    --> {1,+,4}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %div.1 = udiv i64 %iv.1, 4
-; CHECK-NEXT:    --> ({1,+,4}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> ({0,+,4}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %iv.2 = add i64 %iv, 2
 ; CHECK-NEXT:    --> {2,+,4}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %div.2 = udiv i64 %iv.2, 4
-; CHECK-NEXT:    --> ({2,+,4}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> ({0,+,4}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %iv.3 = add i64 %iv, 3
 ; CHECK-NEXT:    --> {3,+,4}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %div.3 = udiv i64 %iv.3, 4
-; CHECK-NEXT:    --> ({3,+,4}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> ({0,+,4}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %iv.4 = add i64 %iv, 4
 ; CHECK-NEXT:    --> {4,+,4}<%loop> U: [0,-3) S: [-9223372036854775808,9223372036854775805) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %div.4 = udiv i64 %iv.4, 4

diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll b/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll
@@ -5,7 +5,7 @@
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %shift = ashr i32 %val, %k
 ; CHECK: Cost of 2 for VF 2: WIDEN ir<%shift> = ashr ir<%val>, ir<%k>
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%shift> = ashr ir<%val>, ir<%k>
-define void @foo(ptr nocapture %p, i32 %k) local_unnamed_addr #0 {
+define void @foo(ptr nocapture %p, i32 %k) local_unnamed_addr {
 entry:
   br label %body
 
@@ -21,5 +21,31 @@ body:
 
 exit:
   ret void
+}
+
+; CHECK: 'shift_and_masked_load_store'
+; CHECK: Cost of 1 for VF 2: CLONE ir<%shifted> = lshr vp<{{.+}}>, ir<2>
+; CHECK: Cost of 1 for VF 4: CLONE ir<%shifted> = lshr vp<{{.+}}>, ir<2>
+; CHECK: Cost of 4 for VF 8: WIDEN ir<%shifted> = lshr ir<%iv>, ir<2>
+define void @shift_and_masked_load_store(i64 %trip.count) #0 {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %shifted = lshr i64 %iv, 2
+  %masked.idx = and i64 %shifted, 1
+  %load.ptr = getelementptr i16, ptr poison, i64 %masked.idx
+  %val = load i16, ptr %load.ptr, align 2
+  %store.idx = shl nuw i64 %iv, 2
+  %store.ptr = getelementptr i8, ptr poison, i64 %store.idx
+  store i16 %val, ptr %store.ptr, align 2
+  %iv.next = add i64 %iv, 1
+  %cmp = icmp eq i64 %iv, %trip.count
+  br i1 %cmp, label %exit, label %loop
 
+exit:
+  ret void
 }
+
+attributes #0 = { "target-features"="+avx2" "tune-cpu"="alderlake" }