
Conversation

@cmc-rep
Contributor

@cmc-rep cmc-rep commented Oct 11, 2025

This allows the LoadStoreVectorizer to absorb redundant loads when forming a vector load. It can be used to fix the situation created by VectorCombine. See: https://discourse.llvm.org/t/what-is-the-purpose-of-vectorizeloadinsert-in-the-vectorcombine-pass/88532
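A minimal sketch of the kind of pattern this targets (hypothetical IR, not taken from the tests in this patch): %ld1 re-reads bytes that are already covered by the wider %ld0, so instead of breaking the chain, the vectorizer can keep %ld1 in the chain, mark it redundant, and serve it from the vectorized load via extractelement.

define i32 @redundant_load(ptr %p) {
  %ld0 = load <2 x i32>, ptr %p, align 8
  %gep = getelementptr inbounds i8, ptr %p, i32 4
  %ld1 = load i32, ptr %gep, align 4     ; covers bytes [4, 8), already read by %ld0
  %e = extractelement <2 x i32> %ld0, i32 0
  %sum = add i32 %e, %ld1
  ret i32 %sum
}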

@llvmbot
Member

llvmbot commented Oct 11, 2025

@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-llvm-transforms

Author: Gang Chen (cmc-rep)

Changes

This allows the LoadStoreVectorizer to absorb redundant loads when forming a vector load. It can be used to fix the situation created by VectorCombine. See: https://discourse.llvm.org/t/what-is-the-purpose-of-vectorizeloadinsert-in-the-vectorcombine-pass/88532


Full diff: https://github.com/llvm/llvm-project/pull/163019.diff

1 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp (+31-20)
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 7b5137b0185ab..484a0b762ad12 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -157,6 +157,7 @@ using EqClassKey =
 struct ChainElem {
   Instruction *Inst;
   APInt OffsetFromLeader;
+  bool Redundant = false; // Set to true when load is redundant.
   ChainElem(Instruction *Inst, APInt OffsetFromLeader)
       : Inst(std::move(Inst)), OffsetFromLeader(std::move(OffsetFromLeader)) {}
 };
@@ -626,26 +627,33 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
   std::vector<Chain> Ret;
   Ret.push_back({C.front()});
 
+  APInt PrevReadEnd = C[0].OffsetFromLeader +
+                      DL.getTypeSizeInBits(getLoadStoreType(&*C[0].Inst)) / 8;
   for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
     // `prev` accesses offsets [PrevDistFromBase, PrevReadEnd).
     auto &CurChain = Ret.back();
-    const ChainElem &Prev = CurChain.back();
-    unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
+    unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*It->Inst));
     assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
                               "collectEquivalenceClass");
-    APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;
 
     // Add this instruction to the end of the current chain, or start a new one.
+    APInt ReadEnd = It->OffsetFromLeader + SzBits / 8;
+    bool IsRedundant = ReadEnd.sle(PrevReadEnd);
     bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
-    LLVM_DEBUG(dbgs() << "LSV: Instructions are "
-                      << (AreContiguous ? "" : "not ") << "contiguous: "
-                      << *Prev.Inst << " (ends at offset " << PrevReadEnd
-                      << ") -> " << *It->Inst << " (starts at offset "
+
+    LLVM_DEBUG(dbgs() << "LSV: Instruction is "
+                      << (AreContiguous
+                              ? "contiguous"
+                              : ((IsRedundant ? "redundant" : "chain-breaker")))
+                      << *It->Inst << " (starts at offset "
                       << It->OffsetFromLeader << ")\n");
-    if (AreContiguous)
+
+    It->Redundant = IsRedundant;
+    if (AreContiguous || IsRedundant)
       CurChain.push_back(*It);
     else
       Ret.push_back({*It});
+    PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd);
   }
 
   // Filter out length-1 chains, these are uninteresting.
@@ -874,10 +882,12 @@ bool Vectorizer::vectorizeChain(Chain &C) {
   Type *VecElemTy = getChainElemTy(C);
   bool IsLoadChain = isa<LoadInst>(C[0].Inst);
   unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
-  unsigned ChainBytes = std::accumulate(
-      C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
-        return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
-      });
+  unsigned ChainBytes = 0;
+  for (auto &E : C) {
+    if (E.Redundant)
+      continue;
+    ChainBytes += DL.getTypeStoreSize(getLoadStoreType(E.Inst));
+  }
   assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
   // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
   // than 1 byte (e.g. VecTy == <32 x i1>).
@@ -916,20 +926,19 @@ bool Vectorizer::vectorizeChain(Chain &C) {
                                         getLoadStorePointerOperand(C[0].Inst),
                                         Alignment);
 
-    unsigned VecIdx = 0;
     for (const ChainElem &E : C) {
       Instruction *I = E.Inst;
       Value *V;
       Type *T = getLoadStoreType(I);
+      int EOffset = (E.OffsetFromLeader - C[0].OffsetFromLeader).getSExtValue();
+      int VecIdx = 8 * EOffset / DL.getTypeSizeInBits(VecElemTy);
       if (auto *VT = dyn_cast<FixedVectorType>(T)) {
         auto Mask = llvm::to_vector<8>(
             llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
         V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
-        VecIdx += VT->getNumElements();
       } else {
         V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
                                          I->getName());
-        ++VecIdx;
       }
       if (V->getType() != I->getType())
         V = Builder.CreateBitOrPointerCast(V, I->getType());
@@ -964,22 +973,24 @@ bool Vectorizer::vectorizeChain(Chain &C) {
 
     // Build the vector to store.
     Value *Vec = PoisonValue::get(VecTy);
-    unsigned VecIdx = 0;
-    auto InsertElem = [&](Value *V) {
+    auto InsertElem = [&](Value *V, unsigned VecIdx) {
       if (V->getType() != VecElemTy)
         V = Builder.CreateBitOrPointerCast(V, VecElemTy);
-      Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
+      Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx));
     };
     for (const ChainElem &E : C) {
       auto *I = cast<StoreInst>(E.Inst);
+      int EOffset = (E.OffsetFromLeader - C[0].OffsetFromLeader).getSExtValue();
+      int VecIdx = 8 * EOffset / DL.getTypeSizeInBits(VecElemTy);
       if (FixedVectorType *VT =
               dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
         for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
           InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
-                                                  Builder.getInt32(J)));
+                                                  Builder.getInt32(J)),
+                     VecIdx++);
         }
       } else {
-        InsertElem(I->getValueOperand());
+        InsertElem(I->getValueOperand(), VecIdx);
       }
     }
 

@llvmbot
Member

llvmbot commented Oct 11, 2025

@llvm/pr-subscribers-vectorizers
@github-actions

github-actions bot commented Oct 12, 2025

⚠️ undef deprecator found issues in your code. ⚠️

You can test this locally with the following command:
git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef([^a-zA-Z0-9_-]|$)|UndefValue::get)' 'HEAD~1' HEAD llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll llvm/test/CodeGen/AMDGPU/mad_uint24.ll llvm/test/CodeGen/AMDGPU/sad.ll llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll

The following files introduce new uses of undef:

  • llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll

Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields undef. You should use poison values for placeholders instead.

In tests, avoid using undef and having tests that trigger undefined behavior. If you need an operand with some unimportant value, you can add a new argument to the function and use that instead.

For example, this is considered a bad practice:

define void @fn() {
  ...
  br i1 undef, ...
}

Please use the following instead:

define void @fn(i1 %cond) {
  ...
  br i1 %cond, ...
}

Please refer to the Undefined Behavior Manual for more information.

@dakersnar
Contributor

I took a look at your linked discussion thread. Is there a reason this problem cannot be solved in VectorCombine itself? This feels like it might be a band-aid solution.

@cmc-rep
Contributor Author

cmc-rep commented Oct 14, 2025

No one has answered my question on Discourse. I feel it may be difficult to convince people that that specific function in VectorCombine only creates noise. My change to the LoadStoreVectorizer not only cleans up that case, it also brings some benefit, as some of the test changes show. I wouldn't call it a band-aid; it is an enhancement to the LoadStoreVectorizer.

@Artem-B
Member

Artem-B commented Oct 14, 2025

Is there a reason this problem cannot be solved in VectorCombine itself? This feels like it might be a band-aid solution.
...
It is an enhancement to LoadStoreVectorizer

I think both points have merit, but...
VectorCombine code explicitly says that they "convert to the vector form because the backend can invert this transform if it does not result in a performance win":

// We can aggressively convert to the vector form because the backend can

It does sound like it's not a bug, but a feature. Whether vector load is profitable would generally require target-specific knowledge. E.g. on NVPTX, some vector loads may be nearly free, while others costly, if too many loaded elements are unused. LoadStoreVectorizer may be a reasonable place to handle some cases that we determine to be unprofitable. And we may still need to handle even more cases in the back-end.

@jayfoad
Contributor

jayfoad commented Oct 15, 2025

It does sound like it's not a bug, but a feature. Whether vector load is profitable would generally require target-specific knowledge. E.g. on NVPTX, some vector loads may be nearly free, while others costly, if too many loaded elements are unused. LoadStoreVectorizer may be a reasonable place to handle some cases that we determine to be unprofitable. And we may still need to handle even more cases in the back-end.

But VectorCombine already uses TTI to get target-specific cost information.

@github-actions

github-actions bot commented Oct 15, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@cmc-rep
Contributor Author

cmc-rep commented Oct 15, 2025

It does sound like it's not a bug, but a feature. Whether vector load is profitable would generally require target-specific knowledge. E.g. on NVPTX, some vector loads may be nearly free, while others costly, if too many loaded elements are unused. LoadStoreVectorizer may be a reasonable place to handle some cases that we determine to be unprofitable. And we may still need to handle even more cases in the back-end.

But VectorCombine already uses TTI to get target-specific cost information.

I don't feel that VectorCombine is well equipped to understand the cost during "vectorizeLoadInsert". It has TTI; however, it only has a very narrow view of the IR, i.e. it does not look beyond the current load+insertelement. The LoadStoreVectorizer is better equipped for this.
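For reference, the VectorCombine transform in question (vectorizeLoadInsert) roughly rewrites a pattern like

  %s = load float, ptr %p, align 16
  %v = insertelement <4 x float> poison, float %s, i64 0

into a wide load plus a shuffle, along the lines of

  %wide = load <4 x float>, ptr %p, align 16
  %v = shufflevector <4 x float> %wide, <4 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>

(sketch only; the exact vector width and mask depend on the target). That widened load is what ends up overlapping neighboring scalar loads, which is the situation this patch lets the LSV absorb.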

@cmc-rep cmc-rep force-pushed the load-store-vect branch 2 times, most recently from 3331fe8 to ec6cce8 on October 15, 2025 at 22:26
@dakersnar
Contributor

Heads up, I'm working on a change that overlaps with your change: #159388. Conceptually I don't see an issue with them both coexisting. One of us will have to resolve some conflicts but it shouldn't be a big deal.

@cmc-rep cmc-rep requested a review from PeddleSpam October 30, 2025 00:47
@cmc-rep
Contributor Author

cmc-rep commented Nov 7, 2025

Ping, would like to get this in.

Collaborator

@nhaehnle nhaehnle left a comment

LGTM, I do have some minor suggestions for improvement though.

<< *Prev.Inst << " (ends at offset " << PrevReadEnd
<< ") -> " << *It->Inst << " (starts at offset "
APInt ReadEnd = It->OffsetFromLeader + SzBits / 8;
// Alllow redundancy: partial or full overlaping counts as contiguous.
Collaborator

Typo: *Allow

Comment on lines 640 to 645
    APInt ReadEnd = It->OffsetFromLeader + SzBits / 8;
    // Alllow redundancy: partial or full overlaping counts as contiguous.
    int ExtraBytes =
        PrevReadEnd.sle(ReadEnd) ? (ReadEnd - PrevReadEnd).getSExtValue() : 0;
    bool AreContiguous = It->OffsetFromLeader.sle(PrevReadEnd) &&
                         SzBits % ElemBytes == 0 && ExtraBytes % ElemBytes == 0;
Collaborator

Use an assertion where it feels like it should be applicable (element size is part of the equivalence class key during gatherChains) and simplify the logic a little bit.

Suggested change
-    APInt ReadEnd = It->OffsetFromLeader + SzBits / 8;
-    // Alllow redundancy: partial or full overlaping counts as contiguous.
-    int ExtraBytes =
-        PrevReadEnd.sle(ReadEnd) ? (ReadEnd - PrevReadEnd).getSExtValue() : 0;
-    bool AreContiguous = It->OffsetFromLeader.sle(PrevReadEnd) &&
-                         SzBits % ElemBytes == 0 && ExtraBytes % ElemBytes == 0;
+    uint64_t SzBytes = SzBits / 8;
+    assert(SzBytes % ElemBytes == 0);
+    APInt ReadEnd = It->OffsetFromLeader + SzBits / 8;
+    // Allow redundancy: partial or full overlap counts as contiguous.
+    bool AreContiguous = false;
+    if (It->OffsetFromLeader.sle(PrevReadEnd)) {
+      uint64_t Overlap = (PrevReadEnd - It->OffsetFromLeader).getZExtValue();
+      if (Overlap % ElemBytes == 0)
+        AreContiguous = true;
+    }

Comment on lines 885 to 900
  int BytesAdded = DL.getTypeSizeInBits(getLoadStoreType(&*C[0].Inst)) / 8;
  APInt PrevReadEnd = C[0].OffsetFromLeader + BytesAdded;
  int ChainBytes = BytesAdded;
  for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
    unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*It->Inst));
    APInt ReadEnd = It->OffsetFromLeader + SzBits / 8;
    // Update ChainBytes considering possible overlap.
    BytesAdded =
        PrevReadEnd.sle(ReadEnd) ? (ReadEnd - PrevReadEnd).getSExtValue() : 0;
    ChainBytes += BytesAdded;
    PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd);
  }
Collaborator

Instead of the loop, couldn't you just look at the first and last chain elements? (This applies to the old code as well, but while we're touching it...)

Contributor Author

I think we need to look at every chain element because we could have a chain element in the middle that covers the rest of the chain and determines the ChainBytes.
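For example, a hypothetical chain (names made up for illustration) where a middle element reaches past the last one:

define void @middle_covers_rest(ptr %p) {
  %a = load i32, ptr %p, align 16
  %gep4 = getelementptr inbounds i8, ptr %p, i32 4
  %b = load <4 x i32>, ptr %gep4, align 4
  %gep8 = getelementptr inbounds i8, ptr %p, i32 8
  %c = load i32, ptr %gep8, align 4
  ret void
}

%a covers bytes [0, 4), %b covers [4, 20), and %c covers [8, 12), so the chain's extent of 20 bytes is determined by the middle element %b, not by the last element %c.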

Comment on lines +929 to +933
// This can happen due to a chain of redundant loads.
// In this case, just use the element-type, and avoid ExtractElement.
if (NumElem == 1)
  VecTy = VecElemTy;
Collaborator

Updating the type here in the middle of the code is unfortunate. Better to move it to the top where VecTy is defined.

Contributor Author

I only want to apply the type change for load chains. Right now we perhaps won't see redundancy in a store chain due to the aliasing check. When that part is improved, we could have something like the following:
  %vec1 = insertelement <1 x i32> %vec0, i32 %v0, i32 0
  %vec2 = insertelement <1 x i32> %vec1, i32 %v1, i32 0
  store <1 x i32> %vec2, ptr %addr
Having those insertelements preserves the correct write order.

@cmc-rep cmc-rep merged commit 92e5608 into llvm:main Nov 13, 2025
7 of 9 checks passed
@dakersnar
Contributor

@cmc-rep

  1. I thought a lot of the unresolved feedback from @arsenm and others was reasonable, and I'm interested in why you chose to merge without addressing it.

  2. Unfortunately, I think the commit you ended up with is incorrect. The addition of redundant loads to the chain in splitChainByContiguity is not being considered in splitChainByAlignment. Consider this section:
    https://github.com/cmc-rep/llvm-project/blob/76de96d450aeab4e585ee629376f2ecac5ca69cd/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp#L734-L747

What happens if you have a chain that looks like this:

define void @test(ptr %ptr) {
  %ld0 = load i32, ptr %ptr, align 16
  %gep1 = getelementptr inbounds i8, ptr %ptr, i32 4
  %ld1 = load i32, ptr %gep1, align 4
  %gep2 = getelementptr inbounds i8, ptr %ptr, i32 8
  %ld2 = load <2 x i32>, ptr %gep2, align 8
  %gep3 = getelementptr inbounds i8, ptr %ptr, i32 8
  %ld3 = load i32, ptr %gep3, align 4
  ret void
}

Here is the debug output when splitChainByAlignment considers the candidate chain that contains all four loads:

LSV: splitChainByContiguity considering chain:
    %ld0 = load i32, ptr %ptr, align 16 (offset 0)
    %ld1 = load i32, ptr %gep1, align 4 (offset 4)
    %ld2 = load <2 x i32>, ptr %gep2, align 8 (offset 8)
    %ld3 = load i32, ptr %gep3, align 4 (offset 8)
LSV: Instruction is contiguous  %ld1 = load i32, ptr %gep1, align 4 (starts at offset 4)
LSV: Instruction is contiguous  %ld2 = load <2 x i32>, ptr %gep2, align 8 (starts at offset 8)
LSV: Instruction is contiguous  %ld3 = load i32, ptr %gep3, align 4 (starts at offset 8)
LSV: splitChainByAlignment considering chain:
    %ld0 = load i32, ptr %ptr, align 16 (offset 0)
    %ld1 = load i32, ptr %gep1, align 4 (offset 4)
    %ld2 = load <2 x i32>, ptr %gep2, align 8 (offset 8)
    %ld3 = load i32, ptr %gep3, align 4 (offset 8)
LSV: splitChainByAlignment considering candidate chain [  %ld0 = load i32, ptr %ptr, align 16 ...   %ld3 = load i32, ptr %gep3, align 4]
LSV: Access of 12B in addrspace 0 with alignment 16 is misaligned, and therefore can't be vectorized.

Note the last line. The algorithm believes that the chain is only accessing 12B, which is wrong; it is accessing 16B. In this case it only results in a false negative: the chain does not get vectorized. But there is probably a scenario where this bug causes a false positive and generates incorrect IR.

Given this, I personally think this should be reverted and reworked. Even if the above issues can be resolved in a follow-up patch, I'm still hesitant to believe that this is the best solution, or that the LSV is the right place to solve this problem.

I think it would be a good idea to create a new LSV-specific lit test (which I think is what was requested here: #163019 (comment)) that covers the scenarios your patch is targeting, so that we can properly arrive at the cleanest correct solution. FWIW, I have tinkered with similar duplicate-load problems in the LSV and determined that EarlyCSE/GVN were better ways to solve them. But if you can distill the cases your patch improves into a new unit test, I could be convinced the LSV is the place for this.

Just throwing out ideas: maybe duplicate loads could simply be removed from the chain, and the chain could continue onward without them in splitChainByContiguity? This would maintain the LSV's existing assumptions about what it means to be contiguous and thus require much less churn in the LSV's overall code.

@cmc-rep
Contributor Author

cmc-rep commented Nov 14, 2025

I did resolve many of Matt's comments in the code in the last update. However, I didn't add a new IR test; I'm not sure what kind of new test should be written.

@cmc-rep
Contributor Author

cmc-rep commented Nov 14, 2025

I'm not sure I have fully understood your test case; let me try it. However, I don't think my change will cause a false positive. Notice that the chain size is recomputed during vectorizeChain (not just taken from splitChainByContiguity). Please let me know if you come up with a false-positive case.

@dakersnar
Contributor

I did resolve many of Matt's comments in the code in the last update. However, I didn't add a new IR test; I'm not sure what kind of new test should be written.

I see you addressed some but not all of them. In particular, I think a new test case that contains all the cases you want to improve would be really helpful to ensure this is the best solution. This could be done by extracting the pre-LSV IR from the cases where you are seeing improvement in your compiler, especially IR that represents the original motivating case.

@cmc-rep
Contributor Author

cmc-rep commented Nov 14, 2025

I did resolve many of Matt's comments in the code in the last update. However, I didn't add a new IR test; I'm not sure what kind of new test should be written.

I see you addressed some but not all of them. In particular, I think a new test case that contains all the cases you want to improve would be really helpful to ensure this is the best solution. This could be done by extracting the pre-LSV IR from the cases where you are seeing improvement in your compiler, especially IR that represents the original motivating case.

I thought about that; the case I had on Discourse is fairly simple. I felt it was already covered by the existing test changes I have seen.

@dakersnar
Contributor

Here's a case that demonstrates the issue on x86. Unfortunately, it is a little convoluted: SizeBytes in splitChainByAlignment is certainly wrong in many cases, but finding a miscompile caused by SizeBytes being wrong was harder. The first and second tests are identical, except the second test has an extra redundant load:

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=x86_64-unknown-linux-gnu -passes=load-store-vectorizer -S < %s | FileCheck %s

define void @test_no_redundant(ptr %ptr) {
; CHECK-LABEL: define void @test_no_redundant(
; CHECK-SAME: ptr [[PTR:%.*]]) {
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[PTR]], align 8
; CHECK-NEXT:    [[LD01:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
; CHECK-NEXT:    [[LD12:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 8
; CHECK-NEXT:    [[LD2:%.*]] = load <2 x i32>, ptr [[GEP2]], align 8
; CHECK-NEXT:    ret void
;
  %ld0 = load i32, ptr %ptr, align 8
  %gep1 = getelementptr inbounds i8, ptr %ptr, i32 4
  %ld1 = load i32, ptr %gep1, align 4
  %gep2 = getelementptr inbounds i8, ptr %ptr, i32 8
  %ld2 = load <2 x i32>, ptr %gep2, align 8
  ret void
}

define void @test_redundant(ptr %ptr) {
; CHECK-LABEL: define void @test_redundant(
; CHECK-SAME: ptr [[PTR:%.*]]) {
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 8
; CHECK-NEXT:    [[LD01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
; CHECK-NEXT:    [[LD12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
; CHECK-NEXT:    [[LD23:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT:    [[LD34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
; CHECK-NEXT:    ret void
;
  %ld0 = load i32, ptr %ptr, align 8
  %gep1 = getelementptr inbounds i8, ptr %ptr, i32 4
  %ld1 = load i32, ptr %gep1, align 4
  %gep2 = getelementptr inbounds i8, ptr %ptr, i32 8
  %ld2 = load <2 x i32>, ptr %gep2, align 8
  %gep3 = getelementptr inbounds i8, ptr %ptr, i32 8
  %ld3 = load i32, ptr %gep3, align 4
  ret void
}

x86 allows misaligned loads up to a certain size if it determines it is fast. My testing indicates that it allows 12B misaligned vectors but not 16B.

In the first test, it determines that 16B is too large and splits up the vector. See this debug output, with the last line being the justification for why it does not create a v4.

LSV: splitChainByAlignment considering chain:
    %ld0 = load i32, ptr %ptr, align 8 (offset 0)
    %ld1 = load i32, ptr %gep1, align 4 (offset 4)
    %ld2 = load <2 x i32>, ptr %gep2, align 8 (offset 8)
LSV: splitChainByAlignment considering candidate chain [  %ld0 = load i32, ptr %ptr, align 8 ...   %ld2 = load <2 x i32>, ptr %gep2, align 8]
LSV: Access of 16B in addrspace 0 with alignment 8 has relative speed 0, which is lower than the elementwise speed of 1.  Therefore this access won't be vectorized.

But for the second test, the additional load causes SizeBytes to be incorrectly computed as 12 instead of 16, even though the size of the chain is 16 bytes. Thus, allowsMisalignedMemoryAccesses returns true, and a v4 is created.
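(A rough reconstruction of that number, assuming SizeBytes is derived from the first and last elements of the candidate chain: with the redundant %ld3 last, SizeBytes = (8 - 0) + 4 = 12, whereas in the first test the last element is %ld2, giving (8 - 0) + 8 = 16. The bytes actually covered are [0, 16) in both tests.)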

@cmc-rep
Contributor Author

cmc-rep commented Nov 14, 2025

Thanks for all the explanation and the cases. I will revert the change and follow up with a patch that fixes SizeBytes, plus these cases.

@dakersnar
Contributor

Sounds good, thank you! I'll be more active with feedback when you put up that patch. Like I said, I suspect this can be done in a cleaner and safer way, but I need to look at the important cases you are trying to tackle more closely first.

cmc-rep added a commit to cmc-rep/llvm-project that referenced this pull request Nov 14, 2025
@cmc-rep
Contributor Author

cmc-rep commented Nov 14, 2025

Here is the reverting PR: #168105

cmc-rep added a commit to cmc-rep/llvm-project that referenced this pull request Nov 14, 2025
cmc-rep added a commit that referenced this pull request Nov 20, 2025
llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Nov 20, 2025
nekoshirro pushed a commit to nekoshirro/Alchemist-LLVM that referenced this pull request Nov 24, 2025
aadeshps-mcw pushed a commit to aadeshps-mcw/llvm-project that referenced this pull request Nov 26, 2025
Priyanshu3820 pushed a commit to Priyanshu3820/llvm-project that referenced this pull request Nov 26, 2025
ronlieb pushed a commit to ROCm/llvm-project that referenced this pull request Nov 27, 2025
ronlieb pushed a commit to ROCm/llvm-project that referenced this pull request Nov 27, 2025