@@ -626,26 +626,38 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
626626 std::vector<Chain> Ret;
627627 Ret.push_back ({C.front ()});
628628
629+ unsigned ChainElemTyBits = DL.getTypeSizeInBits (getChainElemTy (C));
630+ APInt PrevReadEnd = C[0 ].OffsetFromLeader +
631+ DL.getTypeStoreSize (getLoadStoreType (&*C[0 ].Inst ));
629632 for (auto It = std::next (C.begin ()), End = C.end (); It != End; ++It) {
630- // `prev` accesses offsets [PrevDistFromBase, PrevReadEnd).
631633 auto &CurChain = Ret.back ();
632- const ChainElem &Prev = CurChain.back ();
633- unsigned SzBits = DL.getTypeSizeInBits (getLoadStoreType (&*Prev.Inst ));
634- assert (SzBits % 8 == 0 && " Non-byte sizes should have been filtered out by "
635- " collectEquivalenceClass" );
636- APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8 ;
634+ unsigned SzBytes = DL.getTypeStoreSize (getLoadStoreType (&*It->Inst ));
637635
638636 // Add this instruction to the end of the current chain, or start a new one.
639- bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
640- LLVM_DEBUG (dbgs () << " LSV: Instructions are "
641- << (AreContiguous ? " " : " not " ) << " contiguous: "
642- << *Prev.Inst << " (ends at offset " << PrevReadEnd
643- << " ) -> " << *It->Inst << " (starts at offset "
637+ assert (
638+ 8 * SzBytes % ChainElemTyBits == 0 &&
639+ " Every chain-element size must be a multiple of the element size after "
640+ " vectorization." );
641+ APInt ReadEnd = It->OffsetFromLeader + SzBytes;
642+ // Allow redundancy: partial or full overlap counts as contiguous.
643+ bool AreContiguous = false ;
644+ if (It->OffsetFromLeader .sle (PrevReadEnd)) {
645+ // Check overlap is a multiple of the element size after vectorization.
646+ uint64_t Overlap = (PrevReadEnd - It->OffsetFromLeader ).getZExtValue ();
647+ if (8 * Overlap % ChainElemTyBits == 0 )
648+ AreContiguous = true ;
649+ }
650+
651+ LLVM_DEBUG (dbgs () << " LSV: Instruction is "
652+ << (AreContiguous ? " contiguous" : " chain-breaker" )
653+ << *It->Inst << " (starts at offset "
644654 << It->OffsetFromLeader << " )\n " );
655+
645656 if (AreContiguous)
646657 CurChain.push_back (*It);
647658 else
648659 Ret.push_back ({*It});
660+ PrevReadEnd = APIntOps::smax (PrevReadEnd, ReadEnd);
649661 }
650662
651663 // Filter out length-1 chains, these are uninteresting.
@@ -727,14 +739,20 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
727739 // These chains are over the closed interval [CBegin, CEnd].
728740 SmallVector<std::pair<unsigned /* CEnd*/ , unsigned /* SizeBytes*/ >, 8 >
729741 CandidateChains;
742+ // Need to compute the size of every candidate chain from its beginning
743+ // because of possible overlapping among chain elements.
744+ unsigned Sz = DL.getTypeStoreSize (getLoadStoreType (C[CBegin].Inst ));
745+ APInt PrevReadEnd = C[CBegin].OffsetFromLeader + Sz;
730746 for (unsigned CEnd = CBegin + 1 , Size = C.size (); CEnd < Size; ++CEnd) {
731- APInt Sz = C[CEnd].OffsetFromLeader +
732- DL.getTypeStoreSize (getLoadStoreType (C[CEnd].Inst )) -
733- C[CBegin].OffsetFromLeader ;
734- if (Sz.sgt (VecRegBytes))
747+ APInt ReadEnd = C[CEnd].OffsetFromLeader +
748+ DL.getTypeStoreSize (getLoadStoreType (C[CEnd].Inst ));
749+ unsigned BytesAdded =
750+ PrevReadEnd.sle (ReadEnd) ? (ReadEnd - PrevReadEnd).getSExtValue () : 0 ;
751+ Sz += BytesAdded;
752+ if (Sz > VecRegBytes)
735753 break ;
736- CandidateChains.emplace_back (CEnd,
737- static_cast < unsigned >(Sz. getLimitedValue ()) );
754+ CandidateChains.emplace_back (CEnd, Sz);
755+ PrevReadEnd = APIntOps::smax (PrevReadEnd, ReadEnd );
738756 }
739757
740758 // Consider the longest chain first.
@@ -874,15 +892,24 @@ bool Vectorizer::vectorizeChain(Chain &C) {
874892 Type *VecElemTy = getChainElemTy (C);
875893 bool IsLoadChain = isa<LoadInst>(C[0 ].Inst );
876894 unsigned AS = getLoadStoreAddressSpace (C[0 ].Inst );
877- unsigned ChainBytes = std::accumulate (
878- C.begin (), C.end (), 0u , [&](unsigned Bytes, const ChainElem &E) {
879- return Bytes + DL.getTypeStoreSize (getLoadStoreType (E.Inst ));
880- });
881- assert (ChainBytes % DL.getTypeStoreSize (VecElemTy) == 0 );
895+ unsigned BytesAdded = DL.getTypeStoreSize (getLoadStoreType (&*C[0 ].Inst ));
896+ APInt PrevReadEnd = C[0 ].OffsetFromLeader + BytesAdded;
897+ unsigned ChainBytes = BytesAdded;
898+ for (auto It = std::next (C.begin ()), End = C.end (); It != End; ++It) {
899+ unsigned SzBytes = DL.getTypeStoreSize (getLoadStoreType (&*It->Inst ));
900+ APInt ReadEnd = It->OffsetFromLeader + SzBytes;
901+ // Update ChainBytes considering possible overlap.
902+ BytesAdded =
903+ PrevReadEnd.sle (ReadEnd) ? (ReadEnd - PrevReadEnd).getSExtValue () : 0 ;
904+ ChainBytes += BytesAdded;
905+ PrevReadEnd = APIntOps::smax (PrevReadEnd, ReadEnd);
906+ }
907+
908+ assert (8 * ChainBytes % DL.getTypeSizeInBits (VecElemTy) == 0 );
882909 // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
883910 // than 1 byte (e.g. VecTy == <32 x i1>).
884- Type *VecTy = FixedVectorType::get (
885- VecElemTy, 8 * ChainBytes / DL. getTypeSizeInBits (VecElemTy) );
911+ unsigned NumElem = 8 * ChainBytes / DL. getTypeSizeInBits (VecElemTy);
912+ Type *VecTy = FixedVectorType::get (VecElemTy, NumElem );
886913
887914 Align Alignment = getLoadStoreAlignment (C[0 ].Inst );
888915 // If this is a load/store of an alloca, we might have upgraded the alloca's
@@ -909,27 +936,32 @@ bool Vectorizer::vectorizeChain(Chain &C) {
909936 llvm::min_element (C, [](const auto &A, const auto &B) {
910937 return A.Inst ->comesBefore (B.Inst );
911938 })->Inst );
912-
939+ // This can happen due to a chain of redundant loads.
940+ // In this case, just use the element-type, and avoid ExtractElement.
941+ if (NumElem == 1 )
942+ VecTy = VecElemTy;
913943 // Chain is in offset order, so C[0] is the instr with the lowest offset,
914944 // i.e. the root of the vector.
915945 VecInst = Builder.CreateAlignedLoad (VecTy,
916946 getLoadStorePointerOperand (C[0 ].Inst ),
917947 Alignment);
918948
919- unsigned VecIdx = 0 ;
920949 for (const ChainElem &E : C) {
921950 Instruction *I = E.Inst ;
922951 Value *V;
923952 Type *T = getLoadStoreType (I);
953+ unsigned EOffset =
954+ (E.OffsetFromLeader - C[0 ].OffsetFromLeader ).getZExtValue ();
955+ unsigned VecIdx = 8 * EOffset / DL.getTypeSizeInBits (VecElemTy);
924956 if (auto *VT = dyn_cast<FixedVectorType>(T)) {
925957 auto Mask = llvm::to_vector<8 >(
926958 llvm::seq<int >(VecIdx, VecIdx + VT->getNumElements ()));
927959 V = Builder.CreateShuffleVector (VecInst, Mask, I->getName ());
928- VecIdx += VT->getNumElements ();
929- } else {
960+ } else if (VecTy != VecElemTy) {
930961 V = Builder.CreateExtractElement (VecInst, Builder.getInt32 (VecIdx),
931962 I->getName ());
932- ++VecIdx;
963+ } else {
964+ V = VecInst;
933965 }
934966 if (V->getType () != I->getType ())
935967 V = Builder.CreateBitOrPointerCast (V, I->getType ());
@@ -964,22 +996,25 @@ bool Vectorizer::vectorizeChain(Chain &C) {
964996
965997 // Build the vector to store.
966998 Value *Vec = PoisonValue::get (VecTy);
967- unsigned VecIdx = 0 ;
968- auto InsertElem = [&](Value *V) {
999+ auto InsertElem = [&](Value *V, unsigned VecIdx) {
9691000 if (V->getType () != VecElemTy)
9701001 V = Builder.CreateBitOrPointerCast (V, VecElemTy);
971- Vec = Builder.CreateInsertElement (Vec, V, Builder.getInt32 (VecIdx++ ));
1002+ Vec = Builder.CreateInsertElement (Vec, V, Builder.getInt32 (VecIdx));
9721003 };
9731004 for (const ChainElem &E : C) {
9741005 auto *I = cast<StoreInst>(E.Inst );
1006+ unsigned EOffset =
1007+ (E.OffsetFromLeader - C[0 ].OffsetFromLeader ).getZExtValue ();
1008+ unsigned VecIdx = 8 * EOffset / DL.getTypeSizeInBits (VecElemTy);
9751009 if (FixedVectorType *VT =
9761010 dyn_cast<FixedVectorType>(getLoadStoreType (I))) {
9771011 for (int J = 0 , JE = VT->getNumElements (); J < JE; ++J) {
9781012 InsertElem (Builder.CreateExtractElement (I->getValueOperand (),
979- Builder.getInt32 (J)));
1013+ Builder.getInt32 (J)),
1014+ VecIdx++);
9801015 }
9811016 } else {
982- InsertElem (I->getValueOperand ());
1017+ InsertElem (I->getValueOperand (), VecIdx );
9831018 }
9841019 }
9851020
0 commit comments