Fix bug in alignment derivation; update test to show improvement

dakersnar · dakersnar · commit 8240ccb03178 · 2025-11-05T18:39:37.000Z
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -690,8 +690,8 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
 
   // Cache the best aligned element in the chain for use when creating extra
   // elements.
-  Align BestAlignedElemAlign;
-  APInt OffsetOfBestAlignedElemFromLeader;
+  Align BestAlignedElemAlign = getLoadStoreAlignment(C[0].Inst);
+  APInt OffsetOfBestAlignedElemFromLeader = C[0].OffsetFromLeader;
   for (const auto &E : C) {
     Align ElementAlignment = getLoadStoreAlignment(E.Inst);
     if (ElementAlignment > BestAlignedElemAlign) {
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/many_loads_stores.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/many_loads_stores.ll
@@ -1,9 +1,10 @@
 ; This is an end-to-end test that checks that LSV succeeds at vectorizing a
 ; large program with many loads.
 ; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s > %t
-; RUN: grep 'load i8' < %t | count 18
-; RUN: grep 'load <2 x i8>' < %t | count 9
+; RUN: grep 'load i8' < %t | count 12
+; RUN: grep 'load <2 x i8>' < %t | count 3
 ; RUN: grep 'load <4 x i8>' < %t | count 27
+; RUN: grep 'call <4 x i8> @llvm.masked.load.v4i8.p1.*<4 x i1> <i1 false, i1 true, i1 true, i1 true>' < %t | count 6
 
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"