[Transform][LoadStoreVectorizer] allow redundant in Chain #163019
base: main
Conversation
This allows a chain to absorb redundant loads when forming a vector load. It can be used to fix the situation created by VectorCombine. See: https://discourse.llvm.org/t/what-is-the-purpose-of-vectorizeloadinsert-in-the-vectorcombine-pass/88532
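To make the idea concrete, here is a minimal, standalone C++ sketch of the coverage test that marks a later access as redundant. It models the logic with plain integers only; `Access` and `markRedundant` are hypothetical names, not the LLVM API. An access whose byte range ends at or before the furthest byte already covered by the chain adds no new data, so it can stay in the chain and be served from the vector load instead of splitting the chain.

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for a chain element: byte offset from the chain
// leader plus the access size in bytes.
struct Access {
  int64_t Offset;
  int64_t Size;
};

// Walk a chain in offset order, tracking the furthest byte covered so far
// (PrevReadEnd). An access is contiguous if it starts exactly there, and
// redundant if it ends at or before it (its bytes are already covered).
std::vector<bool> markRedundant(const std::vector<Access> &Chain) {
  assert(!Chain.empty() && "chain must have a leader");
  std::vector<bool> Redundant(Chain.size(), false);
  int64_t PrevReadEnd = Chain[0].Offset + Chain[0].Size;
  for (size_t I = 1; I < Chain.size(); ++I) {
    int64_t ReadEnd = Chain[I].Offset + Chain[I].Size;
    Redundant[I] = ReadEnd <= PrevReadEnd; // bytes already covered
    PrevReadEnd = std::max(PrevReadEnd, ReadEnd);
  }
  return Redundant;
}
```

For instance, with a chain of {Offset 0, Size 16} followed by {Offset 0, Size 4} (roughly the shape a <4 x i32> load plus a scalar i32 load of the same base can take after VectorCombine's load+insert rewrite), the second access is marked redundant and later rewritten as an extract from the wide load rather than breaking the chain.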
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-vectorizers
Author: Gang Chen (cmc-rep)
Changes: This allows a chain to absorb redundant loads when forming a vector load. It can be used to fix the situation created by VectorCombine. See: https://discourse.llvm.org/t/what-is-the-purpose-of-vectorizeloadinsert-in-the-vectorcombine-pass/88532
Full diff: https://github.com/llvm/llvm-project/pull/163019.diff
1 File Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 7b5137b0185ab..484a0b762ad12 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -157,6 +157,7 @@ using EqClassKey =
struct ChainElem {
Instruction *Inst;
APInt OffsetFromLeader;
+ bool Redundant = false; // Set to true when load is redundant.
ChainElem(Instruction *Inst, APInt OffsetFromLeader)
: Inst(std::move(Inst)), OffsetFromLeader(std::move(OffsetFromLeader)) {}
};
@@ -626,26 +627,33 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
std::vector<Chain> Ret;
Ret.push_back({C.front()});
+ APInt PrevReadEnd = C[0].OffsetFromLeader +
+ DL.getTypeSizeInBits(getLoadStoreType(&*C[0].Inst)) / 8;
for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
// `prev` accesses offsets [PrevDistFromBase, PrevReadEnd).
auto &CurChain = Ret.back();
- const ChainElem &Prev = CurChain.back();
- unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
+ unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*It->Inst));
assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
"collectEquivalenceClass");
- APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;
// Add this instruction to the end of the current chain, or start a new one.
+ APInt ReadEnd = It->OffsetFromLeader + SzBits / 8;
+ bool IsRedundant = ReadEnd.sle(PrevReadEnd);
bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
- LLVM_DEBUG(dbgs() << "LSV: Instructions are "
- << (AreContiguous ? "" : "not ") << "contiguous: "
- << *Prev.Inst << " (ends at offset " << PrevReadEnd
- << ") -> " << *It->Inst << " (starts at offset "
+
+ LLVM_DEBUG(dbgs() << "LSV: Instruction is "
+ << (AreContiguous
+ ? "contiguous"
+ : ((IsRedundant ? "redundant" : "chain-breaker")))
+ << *It->Inst << " (starts at offset "
<< It->OffsetFromLeader << ")\n");
- if (AreContiguous)
+
+ It->Redundant = IsRedundant;
+ if (AreContiguous || IsRedundant)
CurChain.push_back(*It);
else
Ret.push_back({*It});
+ PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd);
}
// Filter out length-1 chains, these are uninteresting.
@@ -874,10 +882,12 @@ bool Vectorizer::vectorizeChain(Chain &C) {
Type *VecElemTy = getChainElemTy(C);
bool IsLoadChain = isa<LoadInst>(C[0].Inst);
unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
- unsigned ChainBytes = std::accumulate(
- C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
- return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
- });
+ unsigned ChainBytes = 0;
+ for (auto &E : C) {
+ if (E.Redundant)
+ continue;
+ ChainBytes += DL.getTypeStoreSize(getLoadStoreType(E.Inst));
+ }
assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
// VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
// than 1 byte (e.g. VecTy == <32 x i1>).
@@ -916,20 +926,19 @@ bool Vectorizer::vectorizeChain(Chain &C) {
getLoadStorePointerOperand(C[0].Inst),
Alignment);
- unsigned VecIdx = 0;
for (const ChainElem &E : C) {
Instruction *I = E.Inst;
Value *V;
Type *T = getLoadStoreType(I);
+ int EOffset = (E.OffsetFromLeader - C[0].OffsetFromLeader).getSExtValue();
+ int VecIdx = 8 * EOffset / DL.getTypeSizeInBits(VecElemTy);
if (auto *VT = dyn_cast<FixedVectorType>(T)) {
auto Mask = llvm::to_vector<8>(
llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
- VecIdx += VT->getNumElements();
} else {
V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
I->getName());
- ++VecIdx;
}
if (V->getType() != I->getType())
V = Builder.CreateBitOrPointerCast(V, I->getType());
@@ -964,22 +973,24 @@ bool Vectorizer::vectorizeChain(Chain &C) {
// Build the vector to store.
Value *Vec = PoisonValue::get(VecTy);
- unsigned VecIdx = 0;
- auto InsertElem = [&](Value *V) {
+ auto InsertElem = [&](Value *V, unsigned VecIdx) {
if (V->getType() != VecElemTy)
V = Builder.CreateBitOrPointerCast(V, VecElemTy);
- Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
+ Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx));
};
for (const ChainElem &E : C) {
auto *I = cast<StoreInst>(E.Inst);
+ int EOffset = (E.OffsetFromLeader - C[0].OffsetFromLeader).getSExtValue();
+ int VecIdx = 8 * EOffset / DL.getTypeSizeInBits(VecElemTy);
if (FixedVectorType *VT =
dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
- Builder.getInt32(J)));
+ Builder.getInt32(J)),
+ VecIdx++);
}
} else {
- InsertElem(I->getValueOperand());
+ InsertElem(I->getValueOperand(), VecIdx);
}
}
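The second half of the diff stops advancing a running VecIdx counter and instead derives each element's lane index from its byte offset relative to the chain leader, so redundant elements map back onto lanes the wide access already covers. Below is a minimal sketch of that index computation, again using plain integers rather than the LLVM types; the function name and parameters are hypothetical.

```cpp
#include <cassert>
#include <cstdint>

// Compute the lane index of a chain element in the vectorized access from
// its byte offset relative to the chain leader and the lane width in bits.
// Two accesses that start at the same byte get the same lane index, which
// is what lets a redundant load reuse lanes of the wide load.
int64_t laneIndex(int64_t ElemOffsetBytes, int64_t LeaderOffsetBytes,
                  int64_t VecElemBits) {
  int64_t EOffsetBytes = ElemOffsetBytes - LeaderOffsetBytes;
  assert(VecElemBits > 0 && (8 * EOffsetBytes) % VecElemBits == 0 &&
         "element is expected to start on a lane boundary");
  return 8 * EOffsetBytes / VecElemBits;
}
```

With 32-bit lanes, for example, a redundant i32 load at byte offset 0 and the leading <4 x i32> load both map to lane 0, so the scalar is rewritten as an extractelement from the wide load instead of widening the vector.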
You can test this locally with the following command:
git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' 'HEAD~1' HEAD llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll llvm/test/CodeGen/AMDGPU/mad_uint24.ll llvm/test/CodeGen/AMDGPU/sad.ll llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
The following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields undef. In tests, avoid using undef.
For example, this is considered a bad practice:
define void @fn() {
  ...
  br i1 undef, ...
}
Please use the following instead:
define void @fn(i1 %cond) {
  ...
  br i1 %cond, ...
}
Please refer to the Undefined Behavior Manual for more information.