
Commit b64fd18

Author: Mikhail Gudim

[SLPVectorizer] Widen run-time-strided loads.

Suppose we are given pointers of the form `%b + x * %s + y * %c_i`, where the `%c_i` are constants and `%s` is a value fixed only at run time. If the pointers can be rearranged as follows:

```
%b + 0 * %s + 0
%b + 0 * %s + 1
%b + 0 * %s + 2
...
%b + 0 * %s + w
%b + 1 * %s + 0
%b + 1 * %s + 1
%b + 1 * %s + 2
...
%b + 1 * %s + w
...
```

then the memory can be accessed with strided loads of width `w` and stride `%s`. This is motivated by the x264 benchmark.
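
For illustration only (this example is not part of the commit or its tests), the sketch below shows the pattern in LLVM IR under some assumptions: a hypothetical function `@widen_rt_stride_sketch` with base pointer `%b`, run-time stride `%s`, and output pointer `%out`; eight `i8` loads built from four stride multiples and the two constant offsets {0, 1}. In that situation the patch would group the pointers by constant offset (`NumOffsets = 2`), widen the element type from `i8` to `i16`, and could cover the eight loads with one strided load of width 4, provided the target reports the widened type and the strided load as legal and the `MinProfitableStridedLoads` threshold is met.

```llvm
; Hypothetical input: eight i8 loads whose addresses have the form
; %b + x * %s + y, with x in 0..3 (stride %s known only at run time)
; and constant offsets y in {0, 1}.
define void @widen_rt_stride_sketch(ptr %b, i64 %s, ptr %out) {
  ; Group y = 0: pointers %b + x * %s.
  %x1 = mul nsw i64 %s, 1
  %x2 = mul nsw i64 %s, 2
  %x3 = mul nsw i64 %s, 3
  %p0 = getelementptr inbounds i8, ptr %b, i64 0
  %p1 = getelementptr inbounds i8, ptr %b, i64 %x1
  %p2 = getelementptr inbounds i8, ptr %b, i64 %x2
  %p3 = getelementptr inbounds i8, ptr %b, i64 %x3
  ; Group y = 1: one byte past each pointer of the first group.
  %p0b = getelementptr inbounds i8, ptr %p0, i64 1
  %p1b = getelementptr inbounds i8, ptr %p1, i64 1
  %p2b = getelementptr inbounds i8, ptr %p2, i64 1
  %p3b = getelementptr inbounds i8, ptr %p3, i64 1
  %v0  = load i8, ptr %p0,  align 1
  %v0b = load i8, ptr %p0b, align 1
  %v1  = load i8, ptr %p1,  align 1
  %v1b = load i8, ptr %p1b, align 1
  %v2  = load i8, ptr %p2,  align 1
  %v2b = load i8, ptr %p2b, align 1
  %v3  = load i8, ptr %p3,  align 1
  %v3b = load i8, ptr %p3b, align 1
  ; Store the eight bytes contiguously so the store chain can seed SLP.
  %q1 = getelementptr inbounds i8, ptr %out, i64 1
  %q2 = getelementptr inbounds i8, ptr %out, i64 2
  %q3 = getelementptr inbounds i8, ptr %out, i64 3
  %q4 = getelementptr inbounds i8, ptr %out, i64 4
  %q5 = getelementptr inbounds i8, ptr %out, i64 5
  %q6 = getelementptr inbounds i8, ptr %out, i64 6
  %q7 = getelementptr inbounds i8, ptr %out, i64 7
  store i8 %v0,  ptr %out, align 1
  store i8 %v0b, ptr %q1,  align 1
  store i8 %v1,  ptr %q2,  align 1
  store i8 %v1b, ptr %q3,  align 1
  store i8 %v2,  ptr %q4,  align 1
  store i8 %v2b, ptr %q5,  align 1
  store i8 %v3,  ptr %q6,  align 1
  store i8 %v3b, ptr %q7,  align 1
  ret void
}

; With the grouping above (constant offsets 0 and 1), the element type can be
; widened from i8 to i16 and the eight loads could, in principle, be covered by
; a single strided load of width 4 and stride %s, along the lines of:
;   %w = call <4 x i16> @llvm.experimental.vp.strided.load.v4i16.p0.i64(ptr align 1 %b, i64 %s, <4 x i1> splat (i1 true), i32 4)
```

Whether this form is actually emitted depends on the target's answers to `TTI->isTypeLegal` and `TTI->isLegalStridedLoadStore` and on the profitability threshold checked in `analyzeRtStrideCandidate`.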
Parent commit: 0833db8

2 files changed (+192 lines, -14 lines)


llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 115 additions & 11 deletions

@@ -2266,7 +2266,7 @@ class BoUpSLP {
                                       int64_t Diff, Value *Ptr0, Value *PtrN,
                                       StridedPtrInfo &SPtrInfo) const;
 
-  bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy,
+  bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy,
                                 Align CommonAlignment,
                                 SmallVectorImpl<unsigned> &SortedIndices,
                                 StridedPtrInfo &SPtrInfo) const;
@@ -6403,7 +6403,8 @@ static bool isReverseOrder(ArrayRef<unsigned> Order) {
 /// Otherwise, SCEV* of the stride value is returned.
 static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                                      const DataLayout &DL, ScalarEvolution &SE,
-                                     SmallVectorImpl<unsigned> &SortedIndices) {
+                                     SmallVectorImpl<unsigned> &SortedIndices,
+                                     SmallVectorImpl<int64_t> &Coeffs) {
   SmallVector<const SCEV *> SCEVs;
   const SCEV *PtrSCEVLowest = nullptr;
   const SCEV *PtrSCEVHighest = nullptr;
@@ -6478,12 +6479,14 @@ static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
       const auto *SC = dyn_cast<SCEVConstant>(Coeff);
       if (!SC || isa<SCEVCouldNotCompute>(SC))
         return nullptr;
+      Coeffs.push_back((int64_t)SC->getAPInt().getLimitedValue());
       if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
                                                   SE.getMulExpr(Stride, SC)))
               ->isZero())
        return nullptr;
      Dist = SC->getAPInt().getZExtValue();
-    }
+    } else
+      Coeffs.push_back(0);
     // If the strides are not the same or repeated, we can't vectorize.
     if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
       return nullptr;
@@ -6965,23 +6968,124 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
 }
 
 bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
-                                       Type *ScalarTy, Align CommonAlignment,
+                                       Type *ElemTy, Align CommonAlignment,
                                        SmallVectorImpl<unsigned> &SortedIndices,
                                        StridedPtrInfo &SPtrInfo) const {
+  // Group the pointers by constant offset.
+  SmallDenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
+      OffsetToPointerOpIdxMap;
+  for (auto [Idx, Ptr] : enumerate(PointerOps)) {
+    const SCEV *PtrSCEV = SE->getSCEV(Ptr);
+    if (!PtrSCEV)
+      return false;
+
+    const auto *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
+    int64_t Offset = 0;
+    if (Add) {
+      for (int I : seq<int>(Add->getNumOperands())) {
+        const auto *SC = dyn_cast<SCEVConstant>(Add->getOperand(I));
+        if (!SC)
+          continue;
+        Offset = SC->getAPInt().getSExtValue();
+        break;
+      }
+    }
+    OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr);
+    OffsetToPointerOpIdxMap[Offset].second.push_back(Idx);
+  }
+  int NumOffsets = OffsetToPointerOpIdxMap.size();
+
   const unsigned Sz = PointerOps.size();
-  // TODO: VecSz may change if we widen the strided load.
   unsigned VecSz = Sz;
+  Type *ScalarTy = ElemTy;
+  if (NumOffsets > 1) {
+    if (Sz % NumOffsets != 0)
+      return false;
+    VecSz = Sz / NumOffsets;
+    ScalarTy = Type::getIntNTy(SE->getContext(),
+                               DL->getTypeSizeInBits(ElemTy).getFixedValue() *
+                                   NumOffsets);
+  }
   FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, VecSz);
   if (!(Sz > MinProfitableStridedLoads && TTI->isTypeLegal(StridedLoadTy) &&
         TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment)))
     return false;
-  if (const SCEV *Stride =
-          calculateRtStride(PointerOps, ScalarTy, *DL, *SE, SortedIndices)) {
-    SPtrInfo.Ty = getWidenedType(ScalarTy, PointerOps.size());
-    SPtrInfo.StrideSCEV = Stride;
-    return true;
+
+  SmallVector<int64_t> SortedOffsetsV;
+  for (auto [K, _] : OffsetToPointerOpIdxMap)
+    SortedOffsetsV.push_back(K);
+  sort(SortedOffsetsV);
+  if (NumOffsets > 1) {
+    int64_t CommonDiff = SortedOffsetsV[1] - SortedOffsetsV[0];
+    if (CommonDiff != 1)
+      return false;
+    for (int I : seq<int>(1, SortedOffsetsV.size() - 1)) {
+      if (SortedOffsetsV[I + 1] - SortedOffsetsV[I] != CommonDiff)
+        return false;
+    }
   }
-  return false;
+
+  int64_t LowestOffset = SortedOffsetsV[0];
+  SmallVector<Value *> &PointerOps0 =
+      OffsetToPointerOpIdxMap[LowestOffset].first;
+  SmallVector<unsigned> &IndicesInAllPointerOps0 =
+      OffsetToPointerOpIdxMap[LowestOffset].second;
+
+  SmallVector<int64_t> Coeffs0;
+  SmallVector<unsigned> SortedIndicesForOffset0;
+  const SCEV *Stride0 = calculateRtStride(PointerOps0, ElemTy, *DL, *SE,
+                                          SortedIndicesForOffset0, Coeffs0);
+  if (!Stride0)
+    return false;
+  unsigned NumCoeffs0 = Coeffs0.size();
+  if (NumCoeffs0 * NumOffsets != Sz)
+    return false;
+  sort(Coeffs0);
+
+  SmallVector<unsigned> SortedIndicesDraft;
+  SortedIndicesDraft.resize(Sz);
+  auto UpdateSortedIndices =
+      [&](SmallVectorImpl<unsigned> &SortedIndicesForOffset,
+          SmallVectorImpl<unsigned> &IndicesInAllPointerOps,
+          int64_t OffsetNum) {
+        for (const auto [Num, Idx] : enumerate(SortedIndicesForOffset)) {
+          SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
+              IndicesInAllPointerOps[Idx];
+        }
+      };
+
+  UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
+
+  SmallVector<int64_t> Coeffs;
+  SmallVector<unsigned> SortedIndicesForOffset;
+  for (int I : seq<int>(1, NumOffsets)) {
+    Coeffs.clear();
+    SortedIndicesForOffset.clear();
+
+    int64_t Offset = SortedOffsetsV[I];
+    SmallVector<Value *> &PointerOpsForOffset =
+        OffsetToPointerOpIdxMap[Offset].first;
+    SmallVector<unsigned> &IndicesInAllPointerOps =
+        OffsetToPointerOpIdxMap[Offset].second;
+    const SCEV *StrideWithinGroup = calculateRtStride(
+        PointerOpsForOffset, ElemTy, *DL, *SE, SortedIndicesForOffset, Coeffs);
+
+    if (!StrideWithinGroup || StrideWithinGroup != Stride0)
+      return false;
+    if (Coeffs.size() != NumCoeffs0)
+      return false;
+    sort(Coeffs);
+    if (Coeffs != Coeffs0)
+      return false;
+
+    UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, I);
+  }
+
+  SortedIndices.clear();
+  SortedIndices = SortedIndicesDraft;
+  SPtrInfo.StrideSCEV = Stride0;
+  SPtrInfo.Ty = StridedLoadTy;
+  return true;
 }
 
 BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(

llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll

Lines changed: 77 additions & 3 deletions

@@ -332,11 +332,85 @@ define void @rt_stride_1_no_reordering(ptr %pl, i64 %stride, ptr %ps) {
 ; CHECK-LABEL: define void @rt_stride_1_no_reordering(
 ; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[STRIDE0:%.*]] = mul nsw i64 [[STRIDE]], 0
+; CHECK-NEXT:    [[STRIDE1:%.*]] = mul nsw i64 [[STRIDE]], 1
+; CHECK-NEXT:    [[STRIDE2:%.*]] = mul nsw i64 [[STRIDE]], 2
+; CHECK-NEXT:    [[STRIDE3:%.*]] = mul nsw i64 [[STRIDE]], 3
+; CHECK-NEXT:    [[STRIDE4:%.*]] = mul nsw i64 [[STRIDE]], 4
+; CHECK-NEXT:    [[STRIDE5:%.*]] = mul nsw i64 [[STRIDE]], 5
+; CHECK-NEXT:    [[STRIDE6:%.*]] = mul nsw i64 [[STRIDE]], 6
+; CHECK-NEXT:    [[STRIDE7:%.*]] = mul nsw i64 [[STRIDE]], 7
+; CHECK-NEXT:    [[STRIDE8:%.*]] = mul nsw i64 [[STRIDE]], 8
+; CHECK-NEXT:    [[STRIDE9:%.*]] = mul nsw i64 [[STRIDE]], 9
+; CHECK-NEXT:    [[STRIDE10:%.*]] = mul nsw i64 [[STRIDE]], 10
+; CHECK-NEXT:    [[STRIDE11:%.*]] = mul nsw i64 [[STRIDE]], 11
+; CHECK-NEXT:    [[STRIDE12:%.*]] = mul nsw i64 [[STRIDE]], 12
+; CHECK-NEXT:    [[STRIDE13:%.*]] = mul nsw i64 [[STRIDE]], 13
+; CHECK-NEXT:    [[STRIDE14:%.*]] = mul nsw i64 [[STRIDE]], 14
+; CHECK-NEXT:    [[STRIDE15:%.*]] = mul nsw i64 [[STRIDE]], 15
 ; CHECK-NEXT:    [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE0]]
+; CHECK-NEXT:    [[GEP_L1:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE1]]
+; CHECK-NEXT:    [[GEP_L2:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE2]]
+; CHECK-NEXT:    [[GEP_L3:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE3]]
+; CHECK-NEXT:    [[GEP_L4:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE4]]
+; CHECK-NEXT:    [[GEP_L5:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE5]]
+; CHECK-NEXT:    [[GEP_L6:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE6]]
+; CHECK-NEXT:    [[GEP_L7:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE7]]
+; CHECK-NEXT:    [[GEP_L8:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE8]]
+; CHECK-NEXT:    [[GEP_L9:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE9]]
+; CHECK-NEXT:    [[GEP_L10:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE10]]
+; CHECK-NEXT:    [[GEP_L11:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE11]]
+; CHECK-NEXT:    [[GEP_L12:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE12]]
+; CHECK-NEXT:    [[GEP_L13:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE13]]
+; CHECK-NEXT:    [[GEP_L14:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE14]]
+; CHECK-NEXT:    [[GEP_L15:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE15]]
+; CHECK-NEXT:    [[LOAD0:%.*]] = load i8, ptr [[GEP_L0]], align 1
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i8, ptr [[GEP_L1]], align 1
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i8, ptr [[GEP_L2]], align 1
+; CHECK-NEXT:    [[LOAD3:%.*]] = load i8, ptr [[GEP_L3]], align 1
+; CHECK-NEXT:    [[LOAD4:%.*]] = load i8, ptr [[GEP_L4]], align 1
+; CHECK-NEXT:    [[LOAD5:%.*]] = load i8, ptr [[GEP_L5]], align 1
+; CHECK-NEXT:    [[LOAD6:%.*]] = load i8, ptr [[GEP_L6]], align 1
+; CHECK-NEXT:    [[LOAD7:%.*]] = load i8, ptr [[GEP_L7]], align 1
+; CHECK-NEXT:    [[LOAD8:%.*]] = load i8, ptr [[GEP_L8]], align 1
+; CHECK-NEXT:    [[LOAD9:%.*]] = load i8, ptr [[GEP_L9]], align 1
+; CHECK-NEXT:    [[LOAD10:%.*]] = load i8, ptr [[GEP_L10]], align 1
+; CHECK-NEXT:    [[LOAD11:%.*]] = load i8, ptr [[GEP_L11]], align 1
+; CHECK-NEXT:    [[LOAD12:%.*]] = load i8, ptr [[GEP_L12]], align 1
+; CHECK-NEXT:    [[LOAD13:%.*]] = load i8, ptr [[GEP_L13]], align 1
+; CHECK-NEXT:    [[LOAD14:%.*]] = load i8, ptr [[GEP_L14]], align 1
+; CHECK-NEXT:    [[LOAD15:%.*]] = load i8, ptr [[GEP_L15]], align 1
 ; CHECK-NEXT:    [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[STRIDE]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.experimental.vp.strided.load.v16i8.p0.i64(ptr align 1 [[GEP_L0]], i64 [[TMP1]], <16 x i1> splat (i1 true), i32 16)
-; CHECK-NEXT:    store <16 x i8> [[TMP2]], ptr [[GEP_S0]], align 1
+; CHECK-NEXT:    [[GEP_S1:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 1
+; CHECK-NEXT:    [[GEP_S2:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 2
+; CHECK-NEXT:    [[GEP_S3:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 3
+; CHECK-NEXT:    [[GEP_S4:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 4
+; CHECK-NEXT:    [[GEP_S5:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 5
+; CHECK-NEXT:    [[GEP_S6:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 6
+; CHECK-NEXT:    [[GEP_S7:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 7
+; CHECK-NEXT:    [[GEP_S8:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 8
+; CHECK-NEXT:    [[GEP_S9:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 9
+; CHECK-NEXT:    [[GEP_S10:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 10
+; CHECK-NEXT:    [[GEP_S11:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 11
+; CHECK-NEXT:    [[GEP_S12:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 12
+; CHECK-NEXT:    [[GEP_S13:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 13
+; CHECK-NEXT:    [[GEP_S14:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 14
+; CHECK-NEXT:    [[GEP_S15:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 15
+; CHECK-NEXT:    store i8 [[LOAD0]], ptr [[GEP_S0]], align 1
+; CHECK-NEXT:    store i8 [[LOAD1]], ptr [[GEP_S1]], align 1
+; CHECK-NEXT:    store i8 [[LOAD2]], ptr [[GEP_S2]], align 1
+; CHECK-NEXT:    store i8 [[LOAD3]], ptr [[GEP_S3]], align 1
+; CHECK-NEXT:    store i8 [[LOAD4]], ptr [[GEP_S4]], align 1
+; CHECK-NEXT:    store i8 [[LOAD5]], ptr [[GEP_S5]], align 1
+; CHECK-NEXT:    store i8 [[LOAD6]], ptr [[GEP_S6]], align 1
+; CHECK-NEXT:    store i8 [[LOAD7]], ptr [[GEP_S7]], align 1
+; CHECK-NEXT:    store i8 [[LOAD8]], ptr [[GEP_S8]], align 1
+; CHECK-NEXT:    store i8 [[LOAD9]], ptr [[GEP_S9]], align 1
+; CHECK-NEXT:    store i8 [[LOAD10]], ptr [[GEP_S10]], align 1
+; CHECK-NEXT:    store i8 [[LOAD11]], ptr [[GEP_S11]], align 1
+; CHECK-NEXT:    store i8 [[LOAD12]], ptr [[GEP_S12]], align 1
+; CHECK-NEXT:    store i8 [[LOAD13]], ptr [[GEP_S13]], align 1
+; CHECK-NEXT:    store i8 [[LOAD14]], ptr [[GEP_S14]], align 1
+; CHECK-NEXT:    store i8 [[LOAD15]], ptr [[GEP_S15]], align 1
 ; CHECK-NEXT:    ret void
 ;
   %stride0 = mul nsw i64 %stride, 0
