Skip to content

Commit 446b06e

Browse files
committed
Revert "Re-land [Transform][LoadStoreVectorizer] allow redundant in Chain (llvm#168135)"
breaks build of CK This reverts commit 9e9fe08.
1 parent 9664939 commit 446b06e

16 files changed

+336
-473
lines changed

llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

Lines changed: 34 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -626,38 +626,26 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
626626
std::vector<Chain> Ret;
627627
Ret.push_back({C.front()});
628628

629-
unsigned ChainElemTyBits = DL.getTypeSizeInBits(getChainElemTy(C));
630-
APInt PrevReadEnd = C[0].OffsetFromLeader +
631-
DL.getTypeStoreSize(getLoadStoreType(&*C[0].Inst));
632629
for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
630+
// `prev` accesses offsets [PrevDistFromBase, PrevReadEnd).
633631
auto &CurChain = Ret.back();
634-
unsigned SzBytes = DL.getTypeStoreSize(getLoadStoreType(&*It->Inst));
632+
const ChainElem &Prev = CurChain.back();
633+
unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
634+
assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
635+
"collectEquivalenceClass");
636+
APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;
635637

636638
// Add this instruction to the end of the current chain, or start a new one.
637-
assert(
638-
8 * SzBytes % ChainElemTyBits == 0 &&
639-
"Every chain-element size must be a multiple of the element size after "
640-
"vectorization.");
641-
APInt ReadEnd = It->OffsetFromLeader + SzBytes;
642-
// Allow redundancy: partial or full overlap counts as contiguous.
643-
bool AreContiguous = false;
644-
if (It->OffsetFromLeader.sle(PrevReadEnd)) {
645-
// Check overlap is a multiple of the element size after vectorization.
646-
uint64_t Overlap = (PrevReadEnd - It->OffsetFromLeader).getZExtValue();
647-
if (8 * Overlap % ChainElemTyBits == 0)
648-
AreContiguous = true;
649-
}
650-
651-
LLVM_DEBUG(dbgs() << "LSV: Instruction is "
652-
<< (AreContiguous ? "contiguous" : "chain-breaker")
653-
<< *It->Inst << " (starts at offset "
639+
bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
640+
LLVM_DEBUG(dbgs() << "LSV: Instructions are "
641+
<< (AreContiguous ? "" : "not ") << "contiguous: "
642+
<< *Prev.Inst << " (ends at offset " << PrevReadEnd
643+
<< ") -> " << *It->Inst << " (starts at offset "
654644
<< It->OffsetFromLeader << ")\n");
655-
656645
if (AreContiguous)
657646
CurChain.push_back(*It);
658647
else
659648
Ret.push_back({*It});
660-
PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd);
661649
}
662650

663651
// Filter out length-1 chains, these are uninteresting.
@@ -739,20 +727,14 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
739727
// These chains are over the closed interval [CBegin, CEnd].
740728
SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
741729
CandidateChains;
742-
// Need to compute the size of every candidate chain from its beginning
743-
// because of possible overlapping among chain elements.
744-
unsigned Sz = DL.getTypeStoreSize(getLoadStoreType(C[CBegin].Inst));
745-
APInt PrevReadEnd = C[CBegin].OffsetFromLeader + Sz;
746730
for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
747-
APInt ReadEnd = C[CEnd].OffsetFromLeader +
748-
DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst));
749-
unsigned BytesAdded =
750-
PrevReadEnd.sle(ReadEnd) ? (ReadEnd - PrevReadEnd).getSExtValue() : 0;
751-
Sz += BytesAdded;
752-
if (Sz > VecRegBytes)
731+
APInt Sz = C[CEnd].OffsetFromLeader +
732+
DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) -
733+
C[CBegin].OffsetFromLeader;
734+
if (Sz.sgt(VecRegBytes))
753735
break;
754-
CandidateChains.emplace_back(CEnd, Sz);
755-
PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd);
736+
CandidateChains.emplace_back(CEnd,
737+
static_cast<unsigned>(Sz.getLimitedValue()));
756738
}
757739

758740
// Consider the longest chain first.
@@ -892,24 +874,15 @@ bool Vectorizer::vectorizeChain(Chain &C) {
892874
Type *VecElemTy = getChainElemTy(C);
893875
bool IsLoadChain = isa<LoadInst>(C[0].Inst);
894876
unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
895-
unsigned BytesAdded = DL.getTypeStoreSize(getLoadStoreType(&*C[0].Inst));
896-
APInt PrevReadEnd = C[0].OffsetFromLeader + BytesAdded;
897-
unsigned ChainBytes = BytesAdded;
898-
for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
899-
unsigned SzBytes = DL.getTypeStoreSize(getLoadStoreType(&*It->Inst));
900-
APInt ReadEnd = It->OffsetFromLeader + SzBytes;
901-
// Update ChainBytes considering possible overlap.
902-
BytesAdded =
903-
PrevReadEnd.sle(ReadEnd) ? (ReadEnd - PrevReadEnd).getSExtValue() : 0;
904-
ChainBytes += BytesAdded;
905-
PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd);
906-
}
907-
908-
assert(8 * ChainBytes % DL.getTypeSizeInBits(VecElemTy) == 0);
877+
unsigned ChainBytes = std::accumulate(
878+
C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
879+
return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
880+
});
881+
assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
909882
// VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
910883
// than 1 byte (e.g. VecTy == <32 x i1>).
911-
unsigned NumElem = 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy);
912-
Type *VecTy = FixedVectorType::get(VecElemTy, NumElem);
884+
Type *VecTy = FixedVectorType::get(
885+
VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));
913886

914887
Align Alignment = getLoadStoreAlignment(C[0].Inst);
915888
// If this is a load/store of an alloca, we might have upgraded the alloca's
@@ -936,32 +909,27 @@ bool Vectorizer::vectorizeChain(Chain &C) {
936909
llvm::min_element(C, [](const auto &A, const auto &B) {
937910
return A.Inst->comesBefore(B.Inst);
938911
})->Inst);
939-
// This can happen due to a chain of redundant loads.
940-
// In this case, just use the element-type, and avoid ExtractElement.
941-
if (NumElem == 1)
942-
VecTy = VecElemTy;
912+
943913
// Chain is in offset order, so C[0] is the instr with the lowest offset,
944914
// i.e. the root of the vector.
945915
VecInst = Builder.CreateAlignedLoad(VecTy,
946916
getLoadStorePointerOperand(C[0].Inst),
947917
Alignment);
948918

919+
unsigned VecIdx = 0;
949920
for (const ChainElem &E : C) {
950921
Instruction *I = E.Inst;
951922
Value *V;
952923
Type *T = getLoadStoreType(I);
953-
unsigned EOffset =
954-
(E.OffsetFromLeader - C[0].OffsetFromLeader).getZExtValue();
955-
unsigned VecIdx = 8 * EOffset / DL.getTypeSizeInBits(VecElemTy);
956924
if (auto *VT = dyn_cast<FixedVectorType>(T)) {
957925
auto Mask = llvm::to_vector<8>(
958926
llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
959927
V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
960-
} else if (VecTy != VecElemTy) {
928+
VecIdx += VT->getNumElements();
929+
} else {
961930
V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
962931
I->getName());
963-
} else {
964-
V = VecInst;
932+
++VecIdx;
965933
}
966934
if (V->getType() != I->getType())
967935
V = Builder.CreateBitOrPointerCast(V, I->getType());
@@ -996,25 +964,22 @@ bool Vectorizer::vectorizeChain(Chain &C) {
996964

997965
// Build the vector to store.
998966
Value *Vec = PoisonValue::get(VecTy);
999-
auto InsertElem = [&](Value *V, unsigned VecIdx) {
967+
unsigned VecIdx = 0;
968+
auto InsertElem = [&](Value *V) {
1000969
if (V->getType() != VecElemTy)
1001970
V = Builder.CreateBitOrPointerCast(V, VecElemTy);
1002-
Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx));
971+
Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
1003972
};
1004973
for (const ChainElem &E : C) {
1005974
auto *I = cast<StoreInst>(E.Inst);
1006-
unsigned EOffset =
1007-
(E.OffsetFromLeader - C[0].OffsetFromLeader).getZExtValue();
1008-
unsigned VecIdx = 8 * EOffset / DL.getTypeSizeInBits(VecElemTy);
1009975
if (FixedVectorType *VT =
1010976
dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
1011977
for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
1012978
InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
1013-
Builder.getInt32(J)),
1014-
VecIdx++);
979+
Builder.getInt32(J)));
1015980
}
1016981
} else {
1017-
InsertElem(I->getValueOperand(), VecIdx);
982+
InsertElem(I->getValueOperand());
1018983
}
1019984
}
1020985

llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3850,9 +3850,8 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 {
38503850
; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
38513851
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) poison`, addrspace 4)
38523852
; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<32 x s32>) from %ir.ptr0, addrspace 1)
3853-
; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (s32) from `ptr addrspace(1) poison`, addrspace 1)
3854-
; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[LOAD2]](s32)
3855-
; CHECK-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p5) = G_INTTOPTR [[LOAD2]](s32)
3853+
; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(p3) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (p3) from `ptr addrspace(1) poison`, addrspace 1)
3854+
; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(p5) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (p5) from `ptr addrspace(1) poison`, addrspace 1)
38563855
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
38573856
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32_p3_p5
38583857
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -3881,10 +3880,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 {
38813880
; CHECK-NEXT: G_STORE [[UV31]](s32), [[PTR_ADD1]](p5) :: (store (s32) into stack, align 16, addrspace 5)
38823881
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
38833882
; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C4]](s32)
3884-
; CHECK-NEXT: G_STORE [[INTTOPTR]](p3), [[PTR_ADD2]](p5) :: (store (p3) into stack + 4, addrspace 5)
3883+
; CHECK-NEXT: G_STORE [[LOAD2]](p3), [[PTR_ADD2]](p5) :: (store (p3) into stack + 4, addrspace 5)
38853884
; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
38863885
; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C5]](s32)
3887-
; CHECK-NEXT: G_STORE [[INTTOPTR1]](p5), [[PTR_ADD3]](p5) :: (store (p5) into stack + 8, align 8, addrspace 5)
3886+
; CHECK-NEXT: G_STORE [[LOAD3]](p5), [[PTR_ADD3]](p5) :: (store (p5) into stack + 8, align 8, addrspace 5)
38883887
; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
38893888
; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
38903889
; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32)

0 commit comments

Comments
 (0)