@@ -626,26 +626,35 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
626626 std::vector<Chain> Ret;
627627 Ret.push_back ({C.front ()});
628628
629+ unsigned ElemBytes = DL.getTypeStoreSize (getChainElemTy (C));
630+ APInt PrevReadEnd = C[0 ].OffsetFromLeader +
631+ DL.getTypeStoreSize (getLoadStoreType (&*C[0 ].Inst ));
629632 for (auto It = std::next (C.begin ()), End = C.end (); It != End; ++It) {
630633 // `prev` accesses offsets [PrevDistFromBase, PrevReadEnd).
631634 auto &CurChain = Ret.back ();
632- const ChainElem &Prev = CurChain.back ();
633- unsigned SzBits = DL.getTypeSizeInBits (getLoadStoreType (&*Prev.Inst ));
634- assert (SzBits % 8 == 0 && " Non-byte sizes should have been filtered out by "
635- " collectEquivalenceClass" );
636- APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8 ;
635+ unsigned SzBytes = DL.getTypeStoreSize (getLoadStoreType (&*It->Inst ));
637636
638637 // Add this instruction to the end of the current chain, or start a new one.
639- bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
640- LLVM_DEBUG (dbgs () << " LSV: Instructions are "
641- << (AreContiguous ? " " : " not " ) << " contiguous: "
642- << *Prev.Inst << " (ends at offset " << PrevReadEnd
643- << " ) -> " << *It->Inst << " (starts at offset "
638+ assert (SzBytes % ElemBytes == 0 );
639+ APInt ReadEnd = It->OffsetFromLeader + SzBytes;
640+ // Allow redundancy: partial or full overlap counts as contiguous.
641+ bool AreContiguous = false ;
642+ if (It->OffsetFromLeader .sle (PrevReadEnd)) {
643+ uint64_t Overlap = (PrevReadEnd - It->OffsetFromLeader ).getZExtValue ();
644+ if (Overlap % ElemBytes == 0 )
645+ AreContiguous = true ;
646+ }
647+
648+ LLVM_DEBUG (dbgs () << " LSV: Instruction is "
649+ << (AreContiguous ? " contiguous" : " chain-breaker" )
650+ << *It->Inst << " (starts at offset "
644651 << It->OffsetFromLeader << " )\n " );
652+
645653 if (AreContiguous)
646654 CurChain.push_back (*It);
647655 else
648656 Ret.push_back ({*It});
657+ PrevReadEnd = APIntOps::smax (PrevReadEnd, ReadEnd);
649658 }
650659
651660 // Filter out length-1 chains, these are uninteresting.
@@ -727,14 +736,20 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
727736 // These chains are over the closed interval [CBegin, CEnd].
728737 SmallVector<std::pair<unsigned /* CEnd*/ , unsigned /* SizeBytes*/ >, 8 >
729738 CandidateChains;
739+
740+ unsigned BytesAdded = DL.getTypeStoreSize (getLoadStoreType (C[CBegin].Inst ));
741+ APInt PrevReadEnd = C[CBegin].OffsetFromLeader + BytesAdded;
742+ unsigned Sz = BytesAdded;
730743 for (unsigned CEnd = CBegin + 1 , Size = C.size (); CEnd < Size; ++CEnd) {
731- APInt Sz = C[CEnd].OffsetFromLeader +
732- DL.getTypeStoreSize (getLoadStoreType (C[CEnd].Inst )) -
733- C[CBegin].OffsetFromLeader ;
734- if (Sz.sgt (VecRegBytes))
744+ APInt ReadEnd = C[CEnd].OffsetFromLeader +
745+ DL.getTypeStoreSize (getLoadStoreType (C[CEnd].Inst ));
746+ BytesAdded =
747+ PrevReadEnd.sle (ReadEnd) ? (ReadEnd - PrevReadEnd).getSExtValue () : 0 ;
748+ Sz += BytesAdded;
749+ if (Sz > VecRegBytes)
735750 break ;
736- CandidateChains.emplace_back (CEnd,
737- static_cast < unsigned >(Sz. getLimitedValue ()) );
751+ CandidateChains.emplace_back (CEnd, Sz);
752+ PrevReadEnd = APIntOps::smax (PrevReadEnd, ReadEnd );
738753 }
739754
740755 // Consider the longest chain first.
@@ -874,15 +889,24 @@ bool Vectorizer::vectorizeChain(Chain &C) {
874889 Type *VecElemTy = getChainElemTy (C);
875890 bool IsLoadChain = isa<LoadInst>(C[0 ].Inst );
876891 unsigned AS = getLoadStoreAddressSpace (C[0 ].Inst );
877- unsigned ChainBytes = std::accumulate (
878- C.begin (), C.end (), 0u , [&](unsigned Bytes, const ChainElem &E) {
879- return Bytes + DL.getTypeStoreSize (getLoadStoreType (E.Inst ));
880- });
892+ unsigned BytesAdded = DL.getTypeStoreSize (getLoadStoreType (&*C[0 ].Inst ));
893+ APInt PrevReadEnd = C[0 ].OffsetFromLeader + BytesAdded;
894+ unsigned ChainBytes = BytesAdded;
895+ for (auto It = std::next (C.begin ()), End = C.end (); It != End; ++It) {
896+ unsigned SzBytes = DL.getTypeStoreSize (getLoadStoreType (&*It->Inst ));
897+ APInt ReadEnd = It->OffsetFromLeader + SzBytes;
898+ // Update ChainBytes considering possible overlap.
899+ BytesAdded =
900+ PrevReadEnd.sle (ReadEnd) ? (ReadEnd - PrevReadEnd).getSExtValue () : 0 ;
901+ ChainBytes += BytesAdded;
902+ PrevReadEnd = APIntOps::smax (PrevReadEnd, ReadEnd);
903+ }
904+
881905 assert (ChainBytes % DL.getTypeStoreSize (VecElemTy) == 0 );
882906 // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
883907 // than 1 byte (e.g. VecTy == <32 x i1>).
884- Type *VecTy = FixedVectorType::get (
885- VecElemTy, 8 * ChainBytes / DL. getTypeSizeInBits (VecElemTy) );
908+ unsigned NumElem = 8 * ChainBytes / DL. getTypeSizeInBits (VecElemTy);
909+ Type *VecTy = FixedVectorType::get (VecElemTy, NumElem );
886910
887911 Align Alignment = getLoadStoreAlignment (C[0 ].Inst );
888912 // If this is a load/store of an alloca, we might have upgraded the alloca's
@@ -909,27 +933,31 @@ bool Vectorizer::vectorizeChain(Chain &C) {
909933 llvm::min_element (C, [](const auto &A, const auto &B) {
910934 return A.Inst ->comesBefore (B.Inst );
911935 })->Inst );
912-
936+ // This can happen due to a chain of redundant loads.
937+ // In this case, just use the element-type, and avoid ExtractElement.
938+ if (NumElem == 1 )
939+ VecTy = VecElemTy;
913940 // Chain is in offset order, so C[0] is the instr with the lowest offset,
914941 // i.e. the root of the vector.
915942 VecInst = Builder.CreateAlignedLoad (VecTy,
916943 getLoadStorePointerOperand (C[0 ].Inst ),
917944 Alignment);
918945
919- unsigned VecIdx = 0 ;
920946 for (const ChainElem &E : C) {
921947 Instruction *I = E.Inst ;
922948 Value *V;
923949 Type *T = getLoadStoreType (I);
950+ int EOffset = (E.OffsetFromLeader - C[0 ].OffsetFromLeader ).getSExtValue ();
951+ int VecIdx = 8 * EOffset / DL.getTypeSizeInBits (VecElemTy);
924952 if (auto *VT = dyn_cast<FixedVectorType>(T)) {
925953 auto Mask = llvm::to_vector<8 >(
926954 llvm::seq<int >(VecIdx, VecIdx + VT->getNumElements ()));
927955 V = Builder.CreateShuffleVector (VecInst, Mask, I->getName ());
928- VecIdx += VT->getNumElements ();
929- } else {
956+ } else if (VecTy != VecElemTy) {
930957 V = Builder.CreateExtractElement (VecInst, Builder.getInt32 (VecIdx),
931958 I->getName ());
932- ++VecIdx;
959+ } else {
960+ V = VecInst;
933961 }
934962 if (V->getType () != I->getType ())
935963 V = Builder.CreateBitOrPointerCast (V, I->getType ());
@@ -964,22 +992,24 @@ bool Vectorizer::vectorizeChain(Chain &C) {
964992
965993 // Build the vector to store.
966994 Value *Vec = PoisonValue::get (VecTy);
967- unsigned VecIdx = 0 ;
968- auto InsertElem = [&](Value *V) {
995+ auto InsertElem = [&](Value *V, unsigned VecIdx) {
969996 if (V->getType () != VecElemTy)
970997 V = Builder.CreateBitOrPointerCast (V, VecElemTy);
971- Vec = Builder.CreateInsertElement (Vec, V, Builder.getInt32 (VecIdx++ ));
998+ Vec = Builder.CreateInsertElement (Vec, V, Builder.getInt32 (VecIdx));
972999 };
9731000 for (const ChainElem &E : C) {
9741001 auto *I = cast<StoreInst>(E.Inst );
1002+ int EOffset = (E.OffsetFromLeader - C[0 ].OffsetFromLeader ).getSExtValue ();
1003+ int VecIdx = 8 * EOffset / DL.getTypeSizeInBits (VecElemTy);
9751004 if (FixedVectorType *VT =
9761005 dyn_cast<FixedVectorType>(getLoadStoreType (I))) {
9771006 for (int J = 0 , JE = VT->getNumElements (); J < JE; ++J) {
9781007 InsertElem (Builder.CreateExtractElement (I->getValueOperand (),
979- Builder.getInt32 (J)));
1008+ Builder.getInt32 (J)),
1009+ VecIdx++);
9801010 }
9811011 } else {
982- InsertElem (I->getValueOperand ());
1012+ InsertElem (I->getValueOperand (), VecIdx );
9831013 }
9841014 }
9851015
0 commit comments