From a30ca096ddaae30663d4b10c344eff439cd44d57 Mon Sep 17 00:00:00 2001
From: chengjunp
Date: Fri, 8 Aug 2025 20:45:07 +0000
Subject: [PATCH 1/8] Initial impl of tree structure merge in SROA

---
 llvm/lib/Transforms/Scalar/SROA.cpp           | 295 ++++++++++++-
 ...r-promotion-cannot-tree-structure-merge.ll | 214 +++++++++
 ...ctor-promotion-via-tree-structure-merge.ll | 408 ++++++++++++++++++
 3 files changed, 910 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll
 create mode 100644 llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll

diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index d6e27aa20730b..2bbaf7813c3c0 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -91,6 +91,7 @@
 #include <cstdint>
 #include <cstring>
 #include <iterator>
+#include <queue>
 #include <string>
 #include <tuple>
 #include <utility>
@@ -2678,6 +2679,53 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
   return V;
 }
 
+/// Concatenate two fixed vectors that share an element type: the result holds
+/// the elements of \p V0 followed by the elements of \p V1. A shufflevector
+/// needs both operands to have the same type, so when the lengths differ the
+/// shorter operand is first widened with poison lanes. All instructions are
+/// created at the insertion point of \p Builder.
+static Value *mergeTwoVectors(Value *V0, Value *V1, IRBuilder<> &Builder) {
+  assert(V0->getType()->isVectorTy() && V1->getType()->isVectorTy() &&
+         "Can not merge two non-vector values");
+
+  auto *VecType0 = cast<FixedVectorType>(V0->getType());
+  auto *VecType1 = cast<FixedVectorType>(V1->getType());
+  assert(VecType0->getElementType() == VecType1->getElementType() &&
+         "Can not merge two vectors with different element types");
+
+  unsigned NumElts0 = VecType0->getNumElements();
+  unsigned NumElts1 = VecType1->getNumElements();
+  // Number of lanes in each shuffle operand once both have the same type
+  unsigned LargeSize = std::max(NumElts0, NumElts1);
+
+  if (NumElts0 != NumElts1) {
+    // Widen the smaller vector: keep its lanes and fill the rest with poison
+    bool IsV0Smaller = NumElts0 < NumElts1;
+    Value *SmallVec = IsV0Smaller ? V0 : V1;
+    unsigned SmallSize = std::min(NumElts0, NumElts1);
+    SmallVector<int> ExtendMask;
+    for (unsigned i = 0; i < SmallSize; ++i)
+      ExtendMask.push_back(i);
+    for (unsigned i = SmallSize; i < LargeSize; ++i)
+      ExtendMask.push_back(PoisonMaskElem);
+    Value *ExtendedVec = Builder.CreateShuffleVector(
+        SmallVec, PoisonValue::get(SmallVec->getType()), ExtendMask);
+    LLVM_DEBUG(dbgs() << " shufflevector: " << *ExtendedVec << "\n");
+    V0 = IsV0Smaller ? ExtendedVec : V0;
+    V1 = IsV0Smaller ? V1 : ExtendedVec;
+  }
+
+  // The first LargeSize mask indices select from V0, the following ones from
+  // V1, so poison lanes appended to a widened operand are never selected
+  SmallVector<int> ShuffleMask;
+  for (unsigned i = 0; i < NumElts0; ++i)
+    ShuffleMask.push_back(i);
+  for (unsigned i = 0; i < NumElts1; ++i)
+    ShuffleMask.push_back(LargeSize + i);
+
+  return Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+}
+
 namespace {
 
 /// Visitor to rewrite instructions using p particular slice of an alloca
@@ -2822,6 +2870,230 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
     return CanSROA;
   }
 
+  /// Attempts to rewrite a partition using tree-structured merge optimization.
+  ///
+  /// This function analyzes a partition to determine if it can be optimized
+  /// using a tree-structured merge pattern, where multiple non-overlapping
+  /// stores completely fill an alloca. And there is no load from the alloca in
+  /// the middle of the stores. Such patterns can be optimized by eliminating
+  /// the intermediate stores and directly constructing the final vector by
+  /// using shufflevectors.
+  ///
+  /// Example transformation:
+  /// Before: (stores do not have to be in order)
+  ///   %alloca = alloca <8 x float>
+  ///   store <2 x float> %val0, ptr %alloca             ; offset 0-1
+  ///   store <2 x float> %val2, ptr %alloca+16          ; offset 4-5
+  ///   store <2 x float> %val1, ptr %alloca+8           ; offset 2-3
+  ///   store <2 x float> %val3, ptr %alloca+24          ; offset 6-7
+  ///
+  /// After:
+  ///   %alloca = alloca <8 x float>
+  ///   %shuffle0 = shufflevector %val0, %val1, <4 x i32> <i32 0, i32 1, i32 2,
+  ///   i32 3>
+  ///   %shuffle1 = shufflevector %val2, %val3, <4 x i32> <i32 0, i32 1, i32 2,
+  ///   i32 3>
+  ///   %shuffle2 = shufflevector %shuffle0, %shuffle1, <8 x i32> <i32 0, i32 1,
+  ///   i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ///   store %shuffle2, ptr %alloca
+  ///
+  /// The optimization looks for partitions that:
+  /// 1. Have no overlapping split slice tails
+  /// 2. Contain simple, non-overlapping stores that cover the entire alloca
+  /// 3. Have exactly one load that reads the complete alloca structure and not
+  ///    in the middle of the stores (TODO: maybe we can relax the constraint
+  ///    about reading the entire alloca structure)
+  /// 4. Have every stored value defined before the point of the merge
+  ///
+  /// \param P The partition to analyze and potentially rewrite
+  /// \return An optional vector of values that were deleted during the rewrite
+  ///         process, or std::nullopt if the partition cannot be optimized
+  ///         using tree-structured merge
+  std::optional<SmallVector<Value *>>
+  rewriteTreeStructuredMerge(Partition &P) {
+    // No tail slices that overlap with the partition
+    if (P.splitSliceTails().size() > 0)
+      return std::nullopt;
+
+    // The alloca must be a fixed vector type
+    Type *AllocatedTy = NewAI.getAllocatedType();
+    if (!isa<FixedVectorType>(AllocatedTy))
+      return std::nullopt;
+    TypeSize AllocatedBits = DL.getTypeSizeInBits(AllocatedTy);
+
+    // Structure to hold store information
+    struct StoreInfo {
+      StoreInst *Store;
+      uint64_t BeginOffset;
+      uint64_t EndOffset;
+      Value *StoredValue;
+      TypeSize StoredTypeSize = TypeSize::getZero();
+
+      StoreInfo(StoreInst *SI, uint64_t Begin, uint64_t End, Value *Val,
+                TypeSize StoredTypeSize)
+          : Store(SI), BeginOffset(Begin), EndOffset(End), StoredValue(Val),
+            StoredTypeSize(StoredTypeSize) {}
+    };
+
+    SmallVector<StoreInfo> StoreInfos;
+    LoadInst *TheLoad = nullptr;
+    Slice *LoadSlice = nullptr;
+    Type *LoadElementType = nullptr;
+    Type *StoreElementType = nullptr;
+    for (Slice &S : P) {
+      auto *User = cast<Instruction>(S.getUse()->getUser());
+      if (auto *LI = dyn_cast<LoadInst>(User)) {
+        // Do not handle the case where there is more than one load
+        // TODO: maybe we can handle this case
+        if (TheLoad)
+          return std::nullopt;
+        // The load has to be a fixed vector type with the same number of
+        // bits as the new alloca type
+        auto *FixedVecTy = dyn_cast<FixedVectorType>(LI->getType());
+        if (!FixedVecTy || DL.getTypeSizeInBits(FixedVecTy) != AllocatedBits)
+          return std::nullopt;
+        LoadElementType = FixedVecTy->getElementType();
+        TheLoad = LI;
+        LoadSlice = &S;
+      } else if (auto *SI = dyn_cast<StoreInst>(User)) {
+        // A volatile or atomic store has to stay in place, it cannot be
+        // folded into the single plain store created below
+        if (!SI->isSimple())
+          return std::nullopt;
+        // The stored value needs to be a fixed vector
+        Value *StoredValue = SI->getValueOperand();
+        auto *FixedVecTy = dyn_cast<FixedVectorType>(StoredValue->getType());
+        if (!FixedVecTy)
+          return std::nullopt;
+        // All the stores should have the same element type
+        Type *CurrentElementType = FixedVecTy->getElementType();
+        if (StoreElementType && StoreElementType != CurrentElementType)
+          return std::nullopt;
+        StoreElementType = CurrentElementType;
+        StoreInfos.emplace_back(SI, S.beginOffset(), S.endOffset(), StoredValue,
+                                DL.getTypeSizeInBits(FixedVecTy));
+      } else {
+        // If we have instructions other than load and store, we cannot do the
+        // tree structured merge
+        return std::nullopt;
+      }
+    }
+    // If we do not have any load, we cannot do the tree structured merge
+    if (!TheLoad)
+      return std::nullopt;
+
+    // If we do not have any stores, we cannot do the tree structured merge
+    if (StoreInfos.empty())
+      return std::nullopt;
+
+    // The load and store element types should be the same
+    if (LoadElementType != StoreElementType)
+      return std::nullopt;
+
+    // The load should cover the whole alloca
+    // TODO: maybe we can relax this constraint
+    if (!LoadSlice || LoadSlice->beginOffset() != NewAllocaBeginOffset ||
+        LoadSlice->endOffset() != NewAllocaEndOffset)
+      return std::nullopt;
+
+    // Stores should not overlap and should cover the whole alloca
+    // Sort by begin offset
+    llvm::sort(StoreInfos, [](const StoreInfo &A, const StoreInfo &B) {
+      return A.BeginOffset < B.BeginOffset;
+    });
+
+    // Check for overlaps and coverage
+    uint64_t ExpectedStart = NewAllocaBeginOffset;
+    TypeSize TotalStoreBits = TypeSize::getZero();
+    for (const StoreInfo &Info : StoreInfos) {
+      // Check for gap or overlap
+      if (Info.BeginOffset != ExpectedStart)
+        return std::nullopt;
+
+      ExpectedStart = Info.EndOffset;
+      TotalStoreBits += Info.StoredTypeSize;
+    }
+    // Check that stores cover the entire alloca
+    // We need to check both the end offset and the total store bits
+    if (ExpectedStart != NewAllocaEndOffset || TotalStoreBits != AllocatedBits)
+      return std::nullopt;
+
+    // Stores should be in the same basic block
+    // The load should not be in the middle of the stores
+    BasicBlock *LoadBB = TheLoad->getParent();
+    BasicBlock *StoreBB = StoreInfos[0].Store->getParent();
+    for (const StoreInfo &Info : StoreInfos) {
+      if (Info.Store->getParent() != StoreBB)
+        return std::nullopt;
+      if (LoadBB == StoreBB && !Info.Store->comesBefore(TheLoad))
+        return std::nullopt;
+    }
+
+    // StoreInfos is sorted by offset, so the store we insert at is not
+    // necessarily the last one in program order. Give up if a stored value is
+    // defined after it, the merge would use the value before its definition.
+    // TODO: insert at the last store in program order instead of giving up
+    Instruction *InsertPt = StoreInfos.back().Store;
+    for (const StoreInfo &Info : StoreInfos) {
+      auto *Def = dyn_cast<Instruction>(Info.StoredValue);
+      if (Def && Def->getParent() == StoreBB && !Def->comesBefore(InsertPt))
+        return std::nullopt;
+    }
+
+    // If we reach here, the partition can be merged with a tree structured
+    // merge
+    LLVM_DEBUG({
+      dbgs() << "Tree structured merge rewrite:\n Load: " << *TheLoad
+             << "\n Ordered stores:\n";
+      for (auto [i, Info] : enumerate(StoreInfos))
+        dbgs() << " [" << i << "] Range[" << Info.BeginOffset << ", "
+               << Info.EndOffset << ") \tStore: " << *Info.Store
+               << "\tValue: " << *Info.StoredValue << "\n";
+    });
+
+    // Instead of having these stores, we merge all the stored values into a
+    // vector and store the merged value into the alloca
+    SmallVector<Value *> DeletedValues;
+    std::queue<Value *> VecElements;
+    IRBuilder<> Builder(InsertPt);
+    for (const StoreInfo &Info : StoreInfos) {
+      DeletedValues.push_back(Info.Store);
+      VecElements.push(Info.StoredValue);
+    }
+
+    LLVM_DEBUG(dbgs() << " Rewrite stores into shufflevectors:\n");
+    while (VecElements.size() > 1) {
+      uint64_t NumElts = VecElements.size();
+      for (uint64_t i = 0; i < NumElts / 2; i++) {
+        Value *V0 = VecElements.front();
+        VecElements.pop();
+        Value *V1 = VecElements.front();
+        VecElements.pop();
+        Value *Merged = mergeTwoVectors(V0, V1, Builder);
+        LLVM_DEBUG(dbgs() << " shufflevector: " << *Merged << "\n");
+        VecElements.push(Merged);
+      }
+      // Keep an unpaired last element behind the merged pairs
+      if (NumElts % 2 == 1) {
+        Value *V = VecElements.front();
+        VecElements.pop();
+        VecElements.push(V);
+      }
+    }
+
+    // Store the merged value into the alloca
+    Value *MergedValue = VecElements.front();
+    Builder.CreateAlignedStore(MergedValue, &NewAI, getSliceAlign());
+
+    IRBuilder<> LoadBuilder(TheLoad);
+    TheLoad->replaceAllUsesWith(LoadBuilder.CreateAlignedLoad(
+        TheLoad->getType(), &NewAI, getSliceAlign(), TheLoad->isVolatile(),
+        TheLoad->getName() + ".sroa.new.load"));
+    DeletedValues.push_back(TheLoad);
+
+    return DeletedValues;
+  }
+
 private:
   // Make sure the other visit overloads are visible.
   using Base::visit;
@@ -4996,13 +5268,22 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
                                P.endOffset(), IsIntegerPromotable, VecTy,
                                PHIUsers, SelectUsers);
   bool Promotable = true;
-  for (Slice *S : P.splitSliceTails()) {
-    Promotable &= Rewriter.visit(S);
-    ++NumUses;
-  }
-  for (Slice &S : P) {
-    Promotable &= Rewriter.visit(&S);
-    ++NumUses;
+  // Check whether we can have tree-structured merge.
+  std::optional<SmallVector<Value *>> DeletedValues =
+      Rewriter.rewriteTreeStructuredMerge(P);
+  if (DeletedValues) {
+    NumUses += DeletedValues->size() + 1;
+    for (Value *V : *DeletedValues)
+      DeadInsts.push_back(V);
+  } else {
+    for (Slice *S : P.splitSliceTails()) {
+      Promotable &= Rewriter.visit(S);
+      ++NumUses;
+    }
+    for (Slice &S : P) {
+      Promotable &= Rewriter.visit(&S);
+      ++NumUses;
+    }
   }
 
   NumAllocaPartitionUses += NumUses;
diff --git a/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll b/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll
new file mode 100644
index 0000000000000..61d77478e0b59
--- /dev/null
+++ b/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll
@@ -0,0 +1,214 @@
+; REQUIRES: asserts
+; RUN: opt < %s -passes='sroa<preserve-cfg>' -disable-output -debug-only=sroa 2>&1 | FileCheck %s
+; RUN: opt < %s -passes='sroa<modify-cfg>' -disable-output -debug-only=sroa 2>&1 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+; CHECK-NOT: Tree structured merge rewrite
+define i32 @test_alloca_not_fixed_vector() {
+entry:
+  %alloca = alloca [4 x float]
+
+  %ptr0 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 0
+  store float 1.0, ptr %ptr0
+
+  %ptr1 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 1
+  store float 2.0, ptr %ptr1
+
+  %result = load i32, ptr %alloca
+  ret i32 %result
+}
+
+define <4 x float> @test_more_than_one_load(<2 x float> %a, <2 x float> %b) {
+entry:
+  %alloca =
alloca <4 x float> + + %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0 + store <2 x float> %a, ptr %ptr0 + + %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2 + store <2 x float> %b, ptr %ptr1 + + %result1 = load <4 x float>, ptr %alloca + %result2 = load <4 x float>, ptr %alloca + + %final = fadd <4 x float> %result1, %result2 + ret <4 x float> %final +} + +define void @test_no_load(<4 x float> %a) { +entry: + %alloca = alloca <4 x float> + store <4 x float> %a, ptr %alloca + ret void +} + +define i32 @test_load_not_fixed_vector(<2 x float> %a, <2 x float> %b) { +entry: + %alloca = alloca <4 x float> + + %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0 + store <2 x float> %a, ptr %ptr0 + + %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2 + store <2 x float> %b, ptr %ptr1 + + %result = load i32, ptr %alloca + ret i32 %result +} + +define <3 x float> @test_load_not_covering_alloca(<2 x float> %a, <2 x float> %b) { +entry: + %alloca = alloca <4 x float> + + %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0 + store <2 x float> %a, ptr %ptr0 + + %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2 + store <2 x float> %b, ptr %ptr1 + + %result = load <3 x float>, ptr %ptr0 + ret <3 x float> %result +} + +define <4 x float> @test_store_not_fixed_vector( %a) { +entry: + %alloca = alloca <4 x float> + + %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0 + %fixed = extractelement %a, i32 0 + store float %fixed, ptr %ptr0 + + %result = load <4 x float>, ptr %alloca + ret <4 x float> %result +} + +define <4 x float> @test_store_not_same_element_type() { +entry: + %alloca = alloca <4 x float> + + %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0 + %float_vec = insertelement <2 x float> undef, float 1.0, i32 0 + %float_vec2 = insertelement <2 x float> %float_vec, float 2.0, i32 1 + store <2 x float> %float_vec2, ptr 
%ptr0 + + %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2 + %int_vec = insertelement <2 x i32> undef, i32 3, i32 0 + %int_vec2 = insertelement <2 x i32> %int_vec, i32 4, i32 1 + store <2 x i32> %int_vec2, ptr %ptr1 + + %result = load <4 x float>, ptr %alloca + ret <4 x float> %result +} + +define <4 x i32> @test_load_store_different_element_type() { +entry: + %alloca = alloca <4 x float> + + %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0 + %float_vec = insertelement <2 x float> undef, float 1.0, i32 0 + %float_vec2 = insertelement <2 x float> %float_vec, float 2.0, i32 1 + store <2 x float> %float_vec2, ptr %ptr0 + + %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2 + %float_vec3 = insertelement <2 x float> undef, float 3.0, i32 0 + %float_vec4 = insertelement <2 x float> %float_vec3, float 4.0, i32 1 + store <2 x float> %float_vec4, ptr %ptr1 + + %result = load <4 x i32>, ptr %alloca + ret <4 x i32> %result +} + +define <4 x float> @test_no_stores() { +entry: + %alloca = alloca <4 x float> + + %result = load <4 x float>, ptr %alloca + ret <4 x float> %result +} + +define <4 x float> @test_stores_overlapping(<2 x float> %a, <2 x float> %b, <2 x float> %c) { +entry: + %alloca = alloca <4 x float> + + %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0 + store <2 x float> %a, ptr %ptr0 + + %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 1 + store <2 x float> %b, ptr %ptr1 + + %ptr2 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2 + store <2 x float> %c, ptr %ptr2 + + %result = load <4 x float>, ptr %alloca + ret <4 x float> %result +} + +define <4 x float> @test_stores_not_covering_alloca(<2 x float> %a) { +entry: + %alloca = alloca <4 x float> + + %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0 + store <2 x float> %a, ptr %ptr0 + + %result = load <4 x float>, ptr %alloca + ret <4 x float> %result +} + +define <4 x float> 
@test_stores_not_same_basic_block(<2 x float> %a, <2 x float> %b, i1 %cond) { +entry: + %alloca = alloca <4 x float> + + %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0 + store <2 x float> %a, ptr %ptr0 + + br i1 %cond, label %then, label %else + +then: + %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2 + store <2 x float> %b, ptr %ptr1 + br label %merge + +else: + br label %merge + +merge: + %result = load <4 x float>, ptr %alloca + ret <4 x float> %result +} + +define <4 x float> @test_load_before_stores(<2 x float> %a, <2 x float> %b) { +entry: + %alloca = alloca <4 x float> + + %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0 + store <2 x float> %a, ptr %ptr0 + + %intermediate = load <4 x float>, ptr %alloca + + %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2 + store <2 x float> %b, ptr %ptr1 + + ret <4 x float> %intermediate +} + +define <4 x float> @test_other_instructions(<2 x float> %a, <2 x float> %b) { +entry: + %alloca = alloca <4 x float> + + ; Store first vector + %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0 + store <2 x float> %a, ptr %ptr0 + + ; Other instruction (memset) that's not a simple load/store + call void @llvm.memset.p0.i64(ptr %alloca, i8 0, i64 8, i1 false) + + ; Store second vector + %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2 + store <2 x float> %b, ptr %ptr1 + + %result = load <4 x float>, ptr %alloca + ret <4 x float> %result +} + +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) diff --git a/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll b/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll new file mode 100644 index 0000000000000..c74b0b932ddef --- /dev/null +++ b/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll @@ -0,0 +1,408 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 
UTC_ARGS: --version 5 +; RUN: opt < %s -passes='sroa' -S | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG +; RUN: opt < %s -passes='sroa' -S | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG +; RUN: opt < %s -passes=debugify,sroa -S | FileCheck %s --check-prefix=DEBUG +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64" + +; Basic tree-structured merge: 4 stores of <2 x float> into <8 x float> +define <8 x float> @basic_tree_merge(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) { +; CHECK-LABEL: define <8 x float> @basic_tree_merge( +; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], <2 x float> [[D:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[C]], <2 x float> [[D]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <8 x i32> +; CHECK-NEXT: ret <8 x float> [[TMP2]] +; +; DEBUG-LABEL: define <8 x float> @basic_tree_merge( +; DEBUG-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], <2 x float> [[D:%.*]]) !dbg [[DBG5:![0-9]+]] { +; DEBUG-NEXT: [[ENTRY:.*:]] +; DEBUG-NEXT: #dbg_value(ptr poison, [[META9:![0-9]+]], !DIExpression(), [[META17:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META11:![0-9]+]], !DIExpression(), [[META18:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META12:![0-9]+]], !DIExpression(), [[META19:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META13:![0-9]+]], !DIExpression(), [[META20:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META14:![0-9]+]], !DIExpression(), [[META21:![0-9]+]]) +; DEBUG-NEXT: [[TMP0:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <4 x i32> , !dbg [[DBG22:![0-9]+]] +; DEBUG-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[C]], <2 x float> 
[[D]], <4 x i32> , !dbg [[DBG22]] +; DEBUG-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <8 x i32> , !dbg [[DBG22]] +; DEBUG-NEXT: #dbg_value(<8 x float> [[TMP2]], [[META15:![0-9]+]], !DIExpression(), [[META23:![0-9]+]]) +; DEBUG-NEXT: ret <8 x float> [[TMP2]], !dbg [[DBG24:![0-9]+]] +; +entry: + %alloca = alloca <8 x float> + + ; Store the vectors at different offsets + %ptr0 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 0 + store <2 x float> %a, ptr %ptr0 + + %ptr1 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 2 + store <2 x float> %b, ptr %ptr1 + + %ptr2 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 4 + store <2 x float> %c, ptr %ptr2 + + %ptr3 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 6 + store <2 x float> %d, ptr %ptr3 + + ; Load the complete vector + %result = load <8 x float>, ptr %alloca + ret <8 x float> %result +} + +define void @multiple_partitions(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d, ptr %e, ptr %f) { +; CHECK-LABEL: define void @multiple_partitions( +; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], <2 x float> [[D:%.*]], ptr [[E:%.*]], ptr [[F:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[C]], <2 x float> [[D]], <4 x i32> +; CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[E]], align 16 +; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[F]], align 16 +; CHECK-NEXT: ret void +; +; DEBUG-LABEL: define void @multiple_partitions( +; DEBUG-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], <2 x float> [[D:%.*]], ptr [[E:%.*]], ptr [[F:%.*]]) !dbg [[DBG25:![0-9]+]] { +; DEBUG-NEXT: [[ENTRY:.*:]] +; DEBUG-NEXT: #dbg_value(ptr poison, [[META27:![0-9]+]], !DIExpression(), [[META36:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META27]], 
!DIExpression(), [[META36]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META28:![0-9]+]], !DIExpression(), [[META37:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META29:![0-9]+]], !DIExpression(), [[META38:![0-9]+]]) +; DEBUG-NEXT: [[TMP0:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <4 x i32> , !dbg [[DBG39:![0-9]+]] +; DEBUG-NEXT: #dbg_value(ptr undef, [[META30:![0-9]+]], !DIExpression(), [[META40:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META31:![0-9]+]], !DIExpression(), [[META41:![0-9]+]]) +; DEBUG-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[C]], <2 x float> [[D]], <4 x i32> , !dbg [[DBG42:![0-9]+]] +; DEBUG-NEXT: #dbg_value(<4 x float> [[TMP0]], [[META32:![0-9]+]], !DIExpression(), [[META43:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META34:![0-9]+]], !DIExpression(), [[META44:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(<4 x float> [[TMP1]], [[META35:![0-9]+]], !DIExpression(), [[META45:![0-9]+]]) +; DEBUG-NEXT: store <4 x float> [[TMP0]], ptr [[E]], align 16, !dbg [[DBG46:![0-9]+]] +; DEBUG-NEXT: store <4 x float> [[TMP1]], ptr [[F]], align 16, !dbg [[DBG47:![0-9]+]] +; DEBUG-NEXT: ret void, !dbg [[DBG48:![0-9]+]] +; +entry: + %alloca = alloca <8 x float> + + %ptr0 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 0 + store <2 x float> %a, ptr %ptr0 + + %ptr1 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 2 + store <2 x float> %b, ptr %ptr1 + + %ptr2 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 4 + store <2 x float> %c, ptr %ptr2 + + %ptr3 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 6 + store <2 x float> %d, ptr %ptr3 + + %result1 = load <4 x float>, ptr %alloca + + %ptr_offset4 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 4 + %result2 = load <4 x float>, ptr %ptr_offset4 + + store <4 x float> %result1, ptr %e + store <4 x float> %result2, ptr %f + + ret void +} + +; Out-of-order stores: stores happen in non-sequential order +define <8 x i32> 
@out_of_order_stores(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { +; CHECK-LABEL: define <8 x i32> @out_of_order_stores( +; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], <2 x i32> [[D:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> [[D]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; +; DEBUG-LABEL: define <8 x i32> @out_of_order_stores( +; DEBUG-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], <2 x i32> [[D:%.*]]) !dbg [[DBG49:![0-9]+]] { +; DEBUG-NEXT: [[ENTRY:.*:]] +; DEBUG-NEXT: #dbg_value(ptr poison, [[META51:![0-9]+]], !DIExpression(), [[META57:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META52:![0-9]+]], !DIExpression(), [[META58:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META53:![0-9]+]], !DIExpression(), [[META59:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META54:![0-9]+]], !DIExpression(), [[META60:![0-9]+]]) +; DEBUG-NEXT: [[TMP0:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <4 x i32> , !dbg [[DBG61:![0-9]+]] +; DEBUG-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> [[D]], <4 x i32> , !dbg [[DBG61]] +; DEBUG-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> , !dbg [[DBG61]] +; DEBUG-NEXT: #dbg_value(ptr undef, [[META55:![0-9]+]], !DIExpression(), [[META62:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(<8 x i32> [[TMP2]], [[META56:![0-9]+]], !DIExpression(), [[META63:![0-9]+]]) +; DEBUG-NEXT: ret <8 x i32> [[TMP2]], !dbg [[DBG64:![0-9]+]] +; +entry: + %alloca = alloca <8 x i32> + + ; Store out of order + %ptr2 = getelementptr inbounds <8 x i32>, ptr %alloca, i32 0, i32 4 + store <2 x i32> %c, ptr %ptr2 + + %ptr0 = getelementptr inbounds <8 x i32>, ptr %alloca, i32 0, i32 0 + store <2 x 
i32> %a, ptr %ptr0 + + %ptr3 = getelementptr inbounds <8 x i32>, ptr %alloca, i32 0, i32 6 + store <2 x i32> %d, ptr %ptr3 + + %ptr1 = getelementptr inbounds <8 x i32>, ptr %alloca, i32 0, i32 2 + store <2 x i32> %b, ptr %ptr1 + + %result = load <8 x i32>, ptr %alloca + ret <8 x i32> %result +} + +; Single element stores: 8 stores of <1 x i16> into <8 x i16> +define <8 x i16> @single_element_stores(<1 x i16> %a, <1 x i16> %b, <1 x i16> %c, <1 x i16> %d, <1 x i16> %e, <1 x i16> %f, <1 x i16> %g, <1 x i16> %h) { +; CHECK-LABEL: define <8 x i16> @single_element_stores( +; CHECK-SAME: <1 x i16> [[A:%.*]], <1 x i16> [[B:%.*]], <1 x i16> [[C:%.*]], <1 x i16> [[D:%.*]], <1 x i16> [[E:%.*]], <1 x i16> [[F:%.*]], <1 x i16> [[G:%.*]], <1 x i16> [[H:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <1 x i16> [[A]], <1 x i16> [[B]], <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <1 x i16> [[C]], <1 x i16> [[D]], <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <1 x i16> [[E]], <1 x i16> [[F]], <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <1 x i16> [[G]], <1 x i16> [[H]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> +; CHECK-NEXT: ret <8 x i16> [[TMP6]] +; +; DEBUG-LABEL: define <8 x i16> @single_element_stores( +; DEBUG-SAME: <1 x i16> [[A:%.*]], <1 x i16> [[B:%.*]], <1 x i16> [[C:%.*]], <1 x i16> [[D:%.*]], <1 x i16> [[E:%.*]], <1 x i16> [[F:%.*]], <1 x i16> [[G:%.*]], <1 x i16> [[H:%.*]]) !dbg [[DBG65:![0-9]+]] { +; DEBUG-NEXT: [[ENTRY:.*:]] +; DEBUG-NEXT: #dbg_value(ptr poison, [[META67:![0-9]+]], !DIExpression(), [[META77:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META68:![0-9]+]], !DIExpression(), [[META78:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, 
[[META69:![0-9]+]], !DIExpression(), [[META79:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META70:![0-9]+]], !DIExpression(), [[META80:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META71:![0-9]+]], !DIExpression(), [[META81:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META72:![0-9]+]], !DIExpression(), [[META82:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META73:![0-9]+]], !DIExpression(), [[META83:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META74:![0-9]+]], !DIExpression(), [[META84:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META75:![0-9]+]], !DIExpression(), [[META85:![0-9]+]]) +; DEBUG-NEXT: [[TMP0:%.*]] = shufflevector <1 x i16> [[A]], <1 x i16> [[B]], <2 x i32> , !dbg [[DBG86:![0-9]+]] +; DEBUG-NEXT: [[TMP1:%.*]] = shufflevector <1 x i16> [[C]], <1 x i16> [[D]], <2 x i32> , !dbg [[DBG86]] +; DEBUG-NEXT: [[TMP2:%.*]] = shufflevector <1 x i16> [[E]], <1 x i16> [[F]], <2 x i32> , !dbg [[DBG86]] +; DEBUG-NEXT: [[TMP3:%.*]] = shufflevector <1 x i16> [[G]], <1 x i16> [[H]], <2 x i32> , !dbg [[DBG86]] +; DEBUG-NEXT: [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> [[TMP1]], <4 x i32> , !dbg [[DBG86]] +; DEBUG-NEXT: [[TMP5:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP3]], <4 x i32> , !dbg [[DBG86]] +; DEBUG-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> , !dbg [[DBG86]] +; DEBUG-NEXT: #dbg_value(<8 x i16> [[TMP6]], [[META76:![0-9]+]], !DIExpression(), [[META87:![0-9]+]]) +; DEBUG-NEXT: ret <8 x i16> [[TMP6]], !dbg [[DBG88:![0-9]+]] +; +entry: + %alloca = alloca <8 x i16> + + %ptr0 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 0 + store <1 x i16> %a, ptr %ptr0 + %ptr1 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 1 + store <1 x i16> %b, ptr %ptr1 + %ptr2 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 2 + store <1 x i16> %c, ptr %ptr2 + %ptr3 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 3 + store <1 x i16> %d, ptr %ptr3 + 
%ptr4 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 4 + store <1 x i16> %e, ptr %ptr4 + %ptr5 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 5 + store <1 x i16> %f, ptr %ptr5 + %ptr6 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 6 + store <1 x i16> %g, ptr %ptr6 + %ptr7 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 7 + store <1 x i16> %h, ptr %ptr7 + + %result = load <8 x i16>, ptr %alloca + ret <8 x i16> %result +} + +; Non-power-of-2: 3 stores of <2 x float> into <6 x float> +define <6 x float> @non_power_of_2(<2 x float> %a, <2 x float> %b, <2 x float> %c) { +; CHECK-LABEL: define <6 x float> @non_power_of_2( +; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[C]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <6 x i32> +; CHECK-NEXT: ret <6 x float> [[TMP2]] +; +; DEBUG-LABEL: define <6 x float> @non_power_of_2( +; DEBUG-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]]) !dbg [[DBG89:![0-9]+]] { +; DEBUG-NEXT: [[ENTRY:.*:]] +; DEBUG-NEXT: #dbg_value(ptr poison, [[META91:![0-9]+]], !DIExpression(), [[META96:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META92:![0-9]+]], !DIExpression(), [[META97:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META93:![0-9]+]], !DIExpression(), [[META98:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META94:![0-9]+]], !DIExpression(), [[META99:![0-9]+]]) +; DEBUG-NEXT: [[TMP0:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <4 x i32> , !dbg [[DBG100:![0-9]+]] +; DEBUG-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[C]], <2 x float> poison, <4 x i32> , !dbg [[DBG100]] +; DEBUG-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], 
<6 x i32> , !dbg [[DBG100]] +; DEBUG-NEXT: #dbg_value(<6 x float> [[TMP2]], [[META95:![0-9]+]], !DIExpression(), [[META101:![0-9]+]]) +; DEBUG-NEXT: ret <6 x float> [[TMP2]], !dbg [[DBG102:![0-9]+]] +; +entry: + %alloca = alloca <6 x float> + + %ptr0 = getelementptr inbounds <6 x float>, ptr %alloca, i32 0, i32 0 + store <2 x float> %a, ptr %ptr0 + + %ptr1 = getelementptr inbounds <6 x float>, ptr %alloca, i32 0, i32 2 + store <2 x float> %b, ptr %ptr1 + + %ptr2 = getelementptr inbounds <6 x float>, ptr %alloca, i32 0, i32 4 + store <2 x float> %c, ptr %ptr2 + + %result = load <6 x float>, ptr %alloca + ret <6 x float> %result +} + +; Store with different size of vectors +define <7 x float> @store_with_different_size_of_vectors(<1 x float> %a, <4 x float> %b, <2 x float> %c) { +; CHECK-LABEL: define <7 x float> @store_with_different_size_of_vectors( +; CHECK-SAME: <1 x float> [[A:%.*]], <4 x float> [[B:%.*]], <2 x float> [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <1 x float> [[A]], <1 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[B]], <5 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[C]], <2 x float> poison, <5 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> [[TMP2]], <7 x i32> +; CHECK-NEXT: ret <7 x float> [[TMP3]] +; +; DEBUG-LABEL: define <7 x float> @store_with_different_size_of_vectors( +; DEBUG-SAME: <1 x float> [[A:%.*]], <4 x float> [[B:%.*]], <2 x float> [[C:%.*]]) !dbg [[DBG103:![0-9]+]] { +; DEBUG-NEXT: [[ENTRY:.*:]] +; DEBUG-NEXT: #dbg_value(ptr poison, [[META105:![0-9]+]], !DIExpression(), [[META110:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META106:![0-9]+]], !DIExpression(), [[META111:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META107:![0-9]+]], !DIExpression(), [[META112:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META108:![0-9]+]], !DIExpression(), [[META113:![0-9]+]]) +; 
DEBUG-NEXT: [[TMP0:%.*]] = shufflevector <1 x float> [[A]], <1 x float> poison, <4 x i32> , !dbg [[DBG114:![0-9]+]] +; DEBUG-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[B]], <5 x i32> , !dbg [[DBG114]] +; DEBUG-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[C]], <2 x float> poison, <5 x i32> , !dbg [[DBG114]] +; DEBUG-NEXT: [[TMP3:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> [[TMP2]], <7 x i32> , !dbg [[DBG114]] +; DEBUG-NEXT: #dbg_value(<7 x float> [[TMP3]], [[META109:![0-9]+]], !DIExpression(), [[META115:![0-9]+]]) +; DEBUG-NEXT: ret <7 x float> [[TMP3]], !dbg [[DBG116:![0-9]+]] +; +entry: + %alloca = alloca <7 x float> + + %ptr0 = getelementptr inbounds <7 x float>, ptr %alloca, i32 0, i32 0 + store <1 x float> %a, ptr %ptr0 + + %ptr1 = getelementptr inbounds <7 x float>, ptr %alloca, i32 0, i32 1 + store <4 x float> %b, ptr %ptr1 + + %ptr2 = getelementptr inbounds <7 x float>, ptr %alloca, i32 0, i32 5 + store <2 x float> %c, ptr %ptr2 + + %result = load <7 x float>, ptr %alloca + ret <7 x float> %result +} + +;. 
+; DEBUG: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: [[META1:![0-9]+]], producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +; DEBUG: [[META1]] = !DIFile(filename: "{{.*}}", directory: {{.*}}) +; DEBUG: [[DBG5]] = distinct !DISubprogram(name: "basic_tree_merge", linkageName: "basic_tree_merge", scope: null, file: [[META1]], line: 1, type: [[META6:![0-9]+]], scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META8:![0-9]+]]) +; DEBUG: [[META6]] = !DISubroutineType(types: [[META7:![0-9]+]]) +; DEBUG: [[META7]] = !{} +; DEBUG: [[META8]] = !{[[META9]], [[META11]], [[META12]], [[META13]], [[META14]], [[META15]]} +; DEBUG: [[META9]] = !DILocalVariable(name: "1", scope: [[DBG5]], file: [[META1]], line: 1, type: [[META10:![0-9]+]]) +; DEBUG: [[META10]] = !DIBasicType(name: "ty64", size: 64, encoding: DW_ATE_unsigned) +; DEBUG: [[META11]] = !DILocalVariable(name: "2", scope: [[DBG5]], file: [[META1]], line: 2, type: [[META10]]) +; DEBUG: [[META12]] = !DILocalVariable(name: "3", scope: [[DBG5]], file: [[META1]], line: 4, type: [[META10]]) +; DEBUG: [[META13]] = !DILocalVariable(name: "4", scope: [[DBG5]], file: [[META1]], line: 6, type: [[META10]]) +; DEBUG: [[META14]] = !DILocalVariable(name: "5", scope: [[DBG5]], file: [[META1]], line: 8, type: [[META10]]) +; DEBUG: [[META15]] = !DILocalVariable(name: "6", scope: [[DBG5]], file: [[META1]], line: 10, type: [[META16:![0-9]+]]) +; DEBUG: [[META16]] = !DIBasicType(name: "ty256", size: 256, encoding: DW_ATE_unsigned) +; DEBUG: [[META17]] = !DILocation(line: 1, column: 1, scope: [[DBG5]]) +; DEBUG: [[META18]] = !DILocation(line: 2, column: 1, scope: [[DBG5]]) +; DEBUG: [[META19]] = !DILocation(line: 4, column: 1, scope: [[DBG5]]) +; DEBUG: [[META20]] = !DILocation(line: 6, column: 1, scope: [[DBG5]]) +; DEBUG: [[META21]] = !DILocation(line: 8, column: 1, scope: [[DBG5]]) +; DEBUG: [[DBG22]] = !DILocation(line: 9, column: 1, 
scope: [[DBG5]]) +; DEBUG: [[META23]] = !DILocation(line: 10, column: 1, scope: [[DBG5]]) +; DEBUG: [[DBG24]] = !DILocation(line: 11, column: 1, scope: [[DBG5]]) +; DEBUG: [[DBG25]] = distinct !DISubprogram(name: "multiple_partitions", linkageName: "multiple_partitions", scope: null, file: [[META1]], line: 12, type: [[META6]], scopeLine: 12, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META26:![0-9]+]]) +; DEBUG: [[META26]] = !{[[META27]], [[META28]], [[META29]], [[META30]], [[META31]], [[META32]], [[META34]], [[META35]]} +; DEBUG: [[META27]] = !DILocalVariable(name: "7", scope: [[DBG25]], file: [[META1]], line: 12, type: [[META10]]) +; DEBUG: [[META28]] = !DILocalVariable(name: "8", scope: [[DBG25]], file: [[META1]], line: 13, type: [[META10]]) +; DEBUG: [[META29]] = !DILocalVariable(name: "9", scope: [[DBG25]], file: [[META1]], line: 15, type: [[META10]]) +; DEBUG: [[META30]] = !DILocalVariable(name: "10", scope: [[DBG25]], file: [[META1]], line: 17, type: [[META10]]) +; DEBUG: [[META31]] = !DILocalVariable(name: "11", scope: [[DBG25]], file: [[META1]], line: 19, type: [[META10]]) +; DEBUG: [[META32]] = !DILocalVariable(name: "12", scope: [[DBG25]], file: [[META1]], line: 21, type: [[META33:![0-9]+]]) +; DEBUG: [[META33]] = !DIBasicType(name: "ty128", size: 128, encoding: DW_ATE_unsigned) +; DEBUG: [[META34]] = !DILocalVariable(name: "13", scope: [[DBG25]], file: [[META1]], line: 22, type: [[META10]]) +; DEBUG: [[META35]] = !DILocalVariable(name: "14", scope: [[DBG25]], file: [[META1]], line: 23, type: [[META33]]) +; DEBUG: [[META36]] = !DILocation(line: 12, column: 1, scope: [[DBG25]]) +; DEBUG: [[META37]] = !DILocation(line: 13, column: 1, scope: [[DBG25]]) +; DEBUG: [[META38]] = !DILocation(line: 15, column: 1, scope: [[DBG25]]) +; DEBUG: [[DBG39]] = !DILocation(line: 16, column: 1, scope: [[DBG25]]) +; DEBUG: [[META40]] = !DILocation(line: 17, column: 1, scope: [[DBG25]]) +; DEBUG: [[META41]] = !DILocation(line: 19, 
column: 1, scope: [[DBG25]]) +; DEBUG: [[DBG42]] = !DILocation(line: 20, column: 1, scope: [[DBG25]]) +; DEBUG: [[META43]] = !DILocation(line: 21, column: 1, scope: [[DBG25]]) +; DEBUG: [[META44]] = !DILocation(line: 22, column: 1, scope: [[DBG25]]) +; DEBUG: [[META45]] = !DILocation(line: 23, column: 1, scope: [[DBG25]]) +; DEBUG: [[DBG46]] = !DILocation(line: 24, column: 1, scope: [[DBG25]]) +; DEBUG: [[DBG47]] = !DILocation(line: 25, column: 1, scope: [[DBG25]]) +; DEBUG: [[DBG48]] = !DILocation(line: 26, column: 1, scope: [[DBG25]]) +; DEBUG: [[DBG49]] = distinct !DISubprogram(name: "out_of_order_stores", linkageName: "out_of_order_stores", scope: null, file: [[META1]], line: 27, type: [[META6]], scopeLine: 27, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META50:![0-9]+]]) +; DEBUG: [[META50]] = !{[[META51]], [[META52]], [[META53]], [[META54]], [[META55]], [[META56]]} +; DEBUG: [[META51]] = !DILocalVariable(name: "15", scope: [[DBG49]], file: [[META1]], line: 27, type: [[META10]]) +; DEBUG: [[META52]] = !DILocalVariable(name: "16", scope: [[DBG49]], file: [[META1]], line: 28, type: [[META10]]) +; DEBUG: [[META53]] = !DILocalVariable(name: "17", scope: [[DBG49]], file: [[META1]], line: 30, type: [[META10]]) +; DEBUG: [[META54]] = !DILocalVariable(name: "18", scope: [[DBG49]], file: [[META1]], line: 32, type: [[META10]]) +; DEBUG: [[META55]] = !DILocalVariable(name: "19", scope: [[DBG49]], file: [[META1]], line: 34, type: [[META10]]) +; DEBUG: [[META56]] = !DILocalVariable(name: "20", scope: [[DBG49]], file: [[META1]], line: 36, type: [[META16]]) +; DEBUG: [[META57]] = !DILocation(line: 27, column: 1, scope: [[DBG49]]) +; DEBUG: [[META58]] = !DILocation(line: 28, column: 1, scope: [[DBG49]]) +; DEBUG: [[META59]] = !DILocation(line: 30, column: 1, scope: [[DBG49]]) +; DEBUG: [[META60]] = !DILocation(line: 32, column: 1, scope: [[DBG49]]) +; DEBUG: [[DBG61]] = !DILocation(line: 33, column: 1, scope: [[DBG49]]) +; DEBUG: 
[[META62]] = !DILocation(line: 34, column: 1, scope: [[DBG49]]) +; DEBUG: [[META63]] = !DILocation(line: 36, column: 1, scope: [[DBG49]]) +; DEBUG: [[DBG64]] = !DILocation(line: 37, column: 1, scope: [[DBG49]]) +; DEBUG: [[DBG65]] = distinct !DISubprogram(name: "single_element_stores", linkageName: "single_element_stores", scope: null, file: [[META1]], line: 38, type: [[META6]], scopeLine: 38, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META66:![0-9]+]]) +; DEBUG: [[META66]] = !{[[META67]], [[META68]], [[META69]], [[META70]], [[META71]], [[META72]], [[META73]], [[META74]], [[META75]], [[META76]]} +; DEBUG: [[META67]] = !DILocalVariable(name: "21", scope: [[DBG65]], file: [[META1]], line: 38, type: [[META10]]) +; DEBUG: [[META68]] = !DILocalVariable(name: "22", scope: [[DBG65]], file: [[META1]], line: 39, type: [[META10]]) +; DEBUG: [[META69]] = !DILocalVariable(name: "23", scope: [[DBG65]], file: [[META1]], line: 41, type: [[META10]]) +; DEBUG: [[META70]] = !DILocalVariable(name: "24", scope: [[DBG65]], file: [[META1]], line: 43, type: [[META10]]) +; DEBUG: [[META71]] = !DILocalVariable(name: "25", scope: [[DBG65]], file: [[META1]], line: 45, type: [[META10]]) +; DEBUG: [[META72]] = !DILocalVariable(name: "26", scope: [[DBG65]], file: [[META1]], line: 47, type: [[META10]]) +; DEBUG: [[META73]] = !DILocalVariable(name: "27", scope: [[DBG65]], file: [[META1]], line: 49, type: [[META10]]) +; DEBUG: [[META74]] = !DILocalVariable(name: "28", scope: [[DBG65]], file: [[META1]], line: 51, type: [[META10]]) +; DEBUG: [[META75]] = !DILocalVariable(name: "29", scope: [[DBG65]], file: [[META1]], line: 53, type: [[META10]]) +; DEBUG: [[META76]] = !DILocalVariable(name: "30", scope: [[DBG65]], file: [[META1]], line: 55, type: [[META33]]) +; DEBUG: [[META77]] = !DILocation(line: 38, column: 1, scope: [[DBG65]]) +; DEBUG: [[META78]] = !DILocation(line: 39, column: 1, scope: [[DBG65]]) +; DEBUG: [[META79]] = !DILocation(line: 41, column: 1, 
scope: [[DBG65]]) +; DEBUG: [[META80]] = !DILocation(line: 43, column: 1, scope: [[DBG65]]) +; DEBUG: [[META81]] = !DILocation(line: 45, column: 1, scope: [[DBG65]]) +; DEBUG: [[META82]] = !DILocation(line: 47, column: 1, scope: [[DBG65]]) +; DEBUG: [[META83]] = !DILocation(line: 49, column: 1, scope: [[DBG65]]) +; DEBUG: [[META84]] = !DILocation(line: 51, column: 1, scope: [[DBG65]]) +; DEBUG: [[META85]] = !DILocation(line: 53, column: 1, scope: [[DBG65]]) +; DEBUG: [[DBG86]] = !DILocation(line: 54, column: 1, scope: [[DBG65]]) +; DEBUG: [[META87]] = !DILocation(line: 55, column: 1, scope: [[DBG65]]) +; DEBUG: [[DBG88]] = !DILocation(line: 56, column: 1, scope: [[DBG65]]) +; DEBUG: [[DBG89]] = distinct !DISubprogram(name: "non_power_of_2", linkageName: "non_power_of_2", scope: null, file: [[META1]], line: 57, type: [[META6]], scopeLine: 57, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META90:![0-9]+]]) +; DEBUG: [[META90]] = !{[[META91]], [[META92]], [[META93]], [[META94]], [[META95]]} +; DEBUG: [[META91]] = !DILocalVariable(name: "31", scope: [[DBG89]], file: [[META1]], line: 57, type: [[META10]]) +; DEBUG: [[META92]] = !DILocalVariable(name: "32", scope: [[DBG89]], file: [[META1]], line: 58, type: [[META10]]) +; DEBUG: [[META93]] = !DILocalVariable(name: "33", scope: [[DBG89]], file: [[META1]], line: 60, type: [[META10]]) +; DEBUG: [[META94]] = !DILocalVariable(name: "34", scope: [[DBG89]], file: [[META1]], line: 62, type: [[META10]]) +; DEBUG: [[META95]] = !DILocalVariable(name: "35", scope: [[DBG89]], file: [[META1]], line: 64, type: [[META16]]) +; DEBUG: [[META96]] = !DILocation(line: 57, column: 1, scope: [[DBG89]]) +; DEBUG: [[META97]] = !DILocation(line: 58, column: 1, scope: [[DBG89]]) +; DEBUG: [[META98]] = !DILocation(line: 60, column: 1, scope: [[DBG89]]) +; DEBUG: [[META99]] = !DILocation(line: 62, column: 1, scope: [[DBG89]]) +; DEBUG: [[DBG100]] = !DILocation(line: 63, column: 1, scope: [[DBG89]]) +; DEBUG: 
[[META101]] = !DILocation(line: 64, column: 1, scope: [[DBG89]]) +; DEBUG: [[DBG102]] = !DILocation(line: 65, column: 1, scope: [[DBG89]]) +; DEBUG: [[DBG103]] = distinct !DISubprogram(name: "store_with_different_size_of_vectors", linkageName: "store_with_different_size_of_vectors", scope: null, file: [[META1]], line: 66, type: [[META6]], scopeLine: 66, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META104:![0-9]+]]) +; DEBUG: [[META104]] = !{[[META105]], [[META106]], [[META107]], [[META108]], [[META109]]} +; DEBUG: [[META105]] = !DILocalVariable(name: "36", scope: [[DBG103]], file: [[META1]], line: 66, type: [[META10]]) +; DEBUG: [[META106]] = !DILocalVariable(name: "37", scope: [[DBG103]], file: [[META1]], line: 67, type: [[META10]]) +; DEBUG: [[META107]] = !DILocalVariable(name: "38", scope: [[DBG103]], file: [[META1]], line: 69, type: [[META10]]) +; DEBUG: [[META108]] = !DILocalVariable(name: "39", scope: [[DBG103]], file: [[META1]], line: 71, type: [[META10]]) +; DEBUG: [[META109]] = !DILocalVariable(name: "40", scope: [[DBG103]], file: [[META1]], line: 73, type: [[META16]]) +; DEBUG: [[META110]] = !DILocation(line: 66, column: 1, scope: [[DBG103]]) +; DEBUG: [[META111]] = !DILocation(line: 67, column: 1, scope: [[DBG103]]) +; DEBUG: [[META112]] = !DILocation(line: 69, column: 1, scope: [[DBG103]]) +; DEBUG: [[META113]] = !DILocation(line: 71, column: 1, scope: [[DBG103]]) +; DEBUG: [[DBG114]] = !DILocation(line: 72, column: 1, scope: [[DBG103]]) +; DEBUG: [[META115]] = !DILocation(line: 73, column: 1, scope: [[DBG103]]) +; DEBUG: [[DBG116]] = !DILocation(line: 74, column: 1, scope: [[DBG103]]) +;. +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK-MODIFY-CFG: {{.*}} +; CHECK-PRESERVE-CFG: {{.*}} From 46039cebead52e23434f0ebba1e468d8a003068a Mon Sep 17 00:00:00 2001 From: chengjunp Date: Fri, 8 Aug 2025 22:03:01 +0000 Subject: [PATCH 2/8] Not do tree merge when only having one store --- llvm/lib/Transforms/Scalar/SROA.cpp | 4 ++-- .../SROA/vector-promotion-cannot-tree-structure-merge.ll | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 2bbaf7813c3c0..397f44687aa6d 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -2988,8 +2988,8 @@ class AllocaSliceRewriter : public InstVisitor { if (!TheLoad) return std::nullopt; - // If we do not have any stores, we cannot do the tree structured merge - if (StoreInfos.empty()) + // If we do not have multiple stores, we cannot do the tree structured merge + if (StoreInfos.size() < 2) return std::nullopt; // The load and store element types should be the same diff --git a/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll b/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll index 61d77478e0b59..ab11adaa8156e 100644 --- a/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll +++ b/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll @@ -88,12 +88,12 @@ entry: %alloca = alloca <4 x float> %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0 - %float_vec = insertelement <2 x float> undef, float 1.0, i32 0 + %float_vec = insertelement <2 x float> poison, float 1.0, i32 0 %float_vec2 = insertelement <2 x float> %float_vec, float 2.0, i32 1 store <2 x float> %float_vec2, ptr %ptr0 %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2 - %int_vec = insertelement <2 x i32> undef, i32 3, i32 0 + %int_vec = insertelement <2 x i32> poison, i32 3, i32 0 %int_vec2 = insertelement <2 
x i32> %int_vec, i32 4, i32 1 store <2 x i32> %int_vec2, ptr %ptr1 @@ -106,12 +106,12 @@ entry: %alloca = alloca <4 x float> %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0 - %float_vec = insertelement <2 x float> undef, float 1.0, i32 0 + %float_vec = insertelement <2 x float> poison, float 1.0, i32 0 %float_vec2 = insertelement <2 x float> %float_vec, float 2.0, i32 1 store <2 x float> %float_vec2, ptr %ptr0 %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2 - %float_vec3 = insertelement <2 x float> undef, float 3.0, i32 0 + %float_vec3 = insertelement <2 x float> poison, float 3.0, i32 0 %float_vec4 = insertelement <2 x float> %float_vec3, float 4.0, i32 1 store <2 x float> %float_vec4, ptr %ptr1 From 68eea1e5440139a3b23d0057987de18bdd9ffd16 Mon Sep 17 00:00:00 2001 From: chengjunp Date: Fri, 15 Aug 2025 19:27:59 +0000 Subject: [PATCH 3/8] Handle the cases where ld/st has different elt types --- llvm/lib/Transforms/Scalar/SROA.cpp | 108 +++++++++++------- ...r-promotion-cannot-tree-structure-merge.ll | 35 ------ ...ctor-promotion-via-tree-structure-merge.ll | 70 ++++++++++++ 3 files changed, 135 insertions(+), 78 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 397f44687aa6d..7a0ebf7ce6bc0 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -2679,7 +2679,32 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, return V; } -static Value *mergeTwoVectors(Value *V0, Value *V1, IRBuilder<> &Builder) { +/// This function takes two vector values and combines them into a single vector +/// by concatenating their elements. The function handles: +/// +/// 1. Element type mismatch: If either vector's element type differs from +/// NewAIEltType, the function bitcasts the vector to use NewAIEltType while +/// preserving the total bit width (adjusting the number of elements +/// accordingly). +/// +/// 2. 
Size mismatch: After transforming the vectors to have the desired element +/// type, if the two vectors have different numbers of elements, the smaller +/// vector is extended with poison values to match the size of the larger +/// vector before concatenation. +/// +/// 3. Concatenation: The vectors are merged using a shuffle operation that +/// places all elements of V0 first, followed by all elements of V1. +/// +/// \param V0 The first vector to merge (must be a vector type) +/// \param V1 The second vector to merge (must be a vector type) +/// \param DL The data layout for size calculations +/// \param NewAIEltTy The desired element type for the result vector +/// \param Builder IRBuilder for creating new instructions +/// \return A new vector containing all elements from V0 followed by all +/// elements from V1 +static Value *mergeTwoVectors(Value *V0, Value *V1, const DataLayout &DL, + Type *NewAIEltTy, + IRBuilder<> &Builder) { assert(V0->getType()->isVectorTy() && V1->getType()->isVectorTy() && "Can not merge two non-vector values"); @@ -2689,8 +2714,28 @@ static Value *mergeTwoVectors(Value *V0, Value *V1, IRBuilder<> &Builder) { auto *VecType0 = cast(V0->getType()); auto *VecType1 = cast(V1->getType()); - assert(VecType0->getElementType() == VecType1->getElementType() && - "Can not merge two vectors with different element types"); + // If V0/V1 element types are different from NewAllocaElementType, + // we need to introduce bitcasts before merging them + auto BitcastIfNeeded = [&](Value *&V, FixedVectorType *&VecType, + const char *DebugName) { + Type *EltType = VecType->getElementType(); + if (EltType != NewAIEltTy) { + // Calculate new number of elements to maintain same bit width + unsigned TotalBits = + VecType->getNumElements() * DL.getTypeSizeInBits(EltType); + unsigned NewNumElts = + TotalBits / DL.getTypeSizeInBits(NewAIEltTy); + + auto *NewVecType = FixedVectorType::get(NewAIEltTy, NewNumElts); + V = Builder.CreateBitCast(V, NewVecType); + 
VecType = NewVecType; + LLVM_DEBUG(dbgs() << " bitcast " << DebugName << ": " << *V << "\n"); + } + }; + + BitcastIfNeeded(V0, VecType0, "V0"); + BitcastIfNeeded(V1, VecType1, "V1"); + unsigned NumElts0 = VecType0->getNumElements(); unsigned NumElts1 = VecType1->getNumElements(); @@ -2923,24 +2968,19 @@ class AllocaSliceRewriter : public InstVisitor { uint64_t BeginOffset; uint64_t EndOffset; Value *StoredValue; - TypeSize StoredTypeSize = TypeSize::getZero(); - - StoreInfo(StoreInst *SI, uint64_t Begin, uint64_t End, Value *Val, - TypeSize StoredTypeSize) - : Store(SI), BeginOffset(Begin), EndOffset(End), StoredValue(Val), - StoredTypeSize(StoredTypeSize) {} + StoreInfo(StoreInst *SI, uint64_t Begin, uint64_t End, Value *Val) + : Store(SI), BeginOffset(Begin), EndOffset(End), StoredValue(Val) {} }; SmallVector StoreInfos; // The alloca must be a fixed vector type - auto *AllocatedTy = NewAI.getAllocatedType(); - if (!isa(AllocatedTy)) + Type *AllocatedEltTy = nullptr; + if (auto *FixedVecTy = dyn_cast(NewAI.getAllocatedType())) + AllocatedEltTy = FixedVecTy->getElementType(); + else return std::nullopt; - Slice *LoadSlice = nullptr; - Type *LoadElementType = nullptr; - Type *StoreElementType = nullptr; for (Slice &S : P) { auto *User = cast(S.getUse()->getUser()); if (auto *LI = dyn_cast(User)) { @@ -2957,27 +2997,20 @@ class AllocaSliceRewriter : public InstVisitor { if (DL.getTypeSizeInBits(FixedVecTy) != DL.getTypeSizeInBits(NewAI.getAllocatedType())) return std::nullopt; - LoadElementType = FixedVecTy->getElementType(); TheLoad = LI; - LoadSlice = &S; } else if (auto *SI = dyn_cast(User)) { - // The store needs to be a fixed vector type - // All the stores should have the same element type + // The stored value should be a fixed vector type Type *StoredValueType = SI->getValueOperand()->getType(); - Type *CurrentElementType = nullptr; - TypeSize StoredTypeSize = TypeSize::getZero(); - if (auto *FixedVecTy = dyn_cast(StoredValueType)) { - // Fixed vector type - 
use its element type - CurrentElementType = FixedVecTy->getElementType(); - StoredTypeSize = DL.getTypeSizeInBits(FixedVecTy); - } else + if (!isa(StoredValueType)) return std::nullopt; - // Check element type consistency across all stores - if (StoreElementType && StoreElementType != CurrentElementType) + + // The total number of stored bits should be the multiple of the new + // alloca element type size + if (DL.getTypeSizeInBits(StoredValueType) % + DL.getTypeSizeInBits(AllocatedEltTy) != 0) return std::nullopt; - StoreElementType = CurrentElementType; StoreInfos.emplace_back(SI, S.beginOffset(), S.endOffset(), - SI->getValueOperand(), StoredTypeSize); + SI->getValueOperand()); } else { // If we have instructions other than load and store, we cannot do the // tree structured merge @@ -2992,16 +3025,6 @@ class AllocaSliceRewriter : public InstVisitor { if (StoreInfos.size() < 2) return std::nullopt; - // The load and store element types should be the same - if (LoadElementType != StoreElementType) - return std::nullopt; - - // The load should cover the whole alloca - // TODO: maybe we can relax this constraint - if (!LoadSlice || LoadSlice->beginOffset() != NewAllocaBeginOffset || - LoadSlice->endOffset() != NewAllocaEndOffset) - return std::nullopt; - // Stores should not overlap and should cover the whole alloca // Sort by begin offset llvm::sort(StoreInfos, [](const StoreInfo &A, const StoreInfo &B) { @@ -3011,7 +3034,6 @@ class AllocaSliceRewriter : public InstVisitor { // Check for overlaps and coverage uint64_t ExpectedStart = NewAllocaBeginOffset; TypeSize TotalStoreBits = TypeSize::getZero(); - Instruction *PrevStore = nullptr; for (auto &StoreInfo : StoreInfos) { uint64_t BeginOff = StoreInfo.BeginOffset; uint64_t EndOff = StoreInfo.EndOffset; @@ -3021,8 +3043,8 @@ class AllocaSliceRewriter : public InstVisitor { return std::nullopt; ExpectedStart = EndOff; - TotalStoreBits += StoreInfo.StoredTypeSize; - PrevStore = StoreInfo.Store; + TotalStoreBits += + 
DL.getTypeSizeInBits(StoreInfo.Store->getValueOperand()->getType()); } // Check that stores cover the entire alloca // We need check both the end offset and the total store bits @@ -3070,7 +3092,7 @@ class AllocaSliceRewriter : public InstVisitor { VecElements.pop(); Value *V1 = VecElements.front(); VecElements.pop(); - Value *Merged = mergeTwoVectors(V0, V1, Builder); + Value *Merged = mergeTwoVectors(V0, V1, DL, AllocatedEltTy, Builder); LLVM_DEBUG(dbgs() << " shufflevector: " << *Merged << "\n"); VecElements.push(Merged); } diff --git a/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll b/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll index ab11adaa8156e..e4b106856de47 100644 --- a/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll +++ b/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll @@ -83,41 +83,6 @@ entry: ret <4 x float> %result } -define <4 x float> @test_store_not_same_element_type() { -entry: - %alloca = alloca <4 x float> - - %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0 - %float_vec = insertelement <2 x float> poison, float 1.0, i32 0 - %float_vec2 = insertelement <2 x float> %float_vec, float 2.0, i32 1 - store <2 x float> %float_vec2, ptr %ptr0 - - %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2 - %int_vec = insertelement <2 x i32> poison, i32 3, i32 0 - %int_vec2 = insertelement <2 x i32> %int_vec, i32 4, i32 1 - store <2 x i32> %int_vec2, ptr %ptr1 - - %result = load <4 x float>, ptr %alloca - ret <4 x float> %result -} - -define <4 x i32> @test_load_store_different_element_type() { -entry: - %alloca = alloca <4 x float> - - %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0 - %float_vec = insertelement <2 x float> poison, float 1.0, i32 0 - %float_vec2 = insertelement <2 x float> %float_vec, float 2.0, i32 1 - store <2 x float> %float_vec2, ptr %ptr0 - - %ptr1 = getelementptr inbounds 
<4 x float>, ptr %alloca, i32 0, i32 2 - %float_vec3 = insertelement <2 x float> poison, float 3.0, i32 0 - %float_vec4 = insertelement <2 x float> %float_vec3, float 4.0, i32 1 - store <2 x float> %float_vec4, ptr %ptr1 - - %result = load <4 x i32>, ptr %alloca - ret <4 x i32> %result -} define <4 x float> @test_no_stores() { entry: diff --git a/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll b/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll index c74b0b932ddef..83bc48b617f29 100644 --- a/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll +++ b/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll @@ -287,6 +287,60 @@ entry: ret <7 x float> %result } +; Load and store with different element type +define <4 x double> @load_store_different_element_type(<2 x i32> %a, <2 x float> %b, <2 x float> %c, <2 x i32> %d) { +; CHECK-LABEL: define <4 x double> @load_store_different_element_type( +; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], <2 x i32> [[D:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x double> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <1 x double> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[C]] to <1 x double> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[D]] to <1 x double> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP5]], <4 x i32> +; CHECK-NEXT: ret <4 x double> [[TMP6]] +; +; DEBUG-LABEL: define <4 x double> @load_store_different_element_type( +; DEBUG-SAME: <2 x i32> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], <2 x i32> [[D:%.*]]) !dbg [[DBG117:![0-9]+]] { +; DEBUG-NEXT: [[ENTRY:.*:]] +; DEBUG-NEXT: 
#dbg_value(ptr poison, [[META119:![0-9]+]], !DIExpression(), [[META125:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META119]], !DIExpression(), [[META125]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META120:![0-9]+]], !DIExpression(), [[META126:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META121:![0-9]+]], !DIExpression(), [[META127:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META122:![0-9]+]], !DIExpression(), [[META128:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META123:![0-9]+]], !DIExpression(), [[META129:![0-9]+]]) +; DEBUG-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x double>, !dbg [[DBG130:![0-9]+]] +; DEBUG-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <1 x double>, !dbg [[DBG130]] +; DEBUG-NEXT: [[TMP2:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> , !dbg [[DBG130]] +; DEBUG-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[C]] to <1 x double>, !dbg [[DBG130]] +; DEBUG-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[D]] to <1 x double>, !dbg [[DBG130]] +; DEBUG-NEXT: [[TMP5:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP4]], <2 x i32> , !dbg [[DBG130]] +; DEBUG-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP5]], <4 x i32> , !dbg [[DBG130]] +; DEBUG-NEXT: #dbg_value(<4 x double> [[TMP6]], [[META124:![0-9]+]], !DIExpression(), [[META131:![0-9]+]]) +; DEBUG-NEXT: ret <4 x double> [[TMP6]], !dbg [[DBG132:![0-9]+]] +; +entry: + %alloca = alloca <8 x float> + + ; Store the vectors at different offsets + %ptr0 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 0 + store <2 x i32> %a, ptr %ptr0 + + %ptr1 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 2 + store <2 x float> %b, ptr %ptr1 + + %ptr2 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 4 + store <2 x float> %c, ptr %ptr2 + + %ptr3 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 6 + store <2 x i32> %d, ptr %ptr3 + + ; Load the complete vector + %result = load <4 x 
double>, ptr %alloca + ret <4 x double> %result +} + ;. ; DEBUG: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: [[META1:![0-9]+]], producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) ; DEBUG: [[META1]] = !DIFile(filename: "{{.*}}", directory: {{.*}}) @@ -402,6 +456,22 @@ entry: ; DEBUG: [[DBG114]] = !DILocation(line: 72, column: 1, scope: [[DBG103]]) ; DEBUG: [[META115]] = !DILocation(line: 73, column: 1, scope: [[DBG103]]) ; DEBUG: [[DBG116]] = !DILocation(line: 74, column: 1, scope: [[DBG103]]) +; DEBUG: [[DBG117]] = distinct !DISubprogram(name: "load_store_different_element_type", linkageName: "load_store_different_element_type", scope: null, file: [[META1]], line: 75, type: [[META6]], scopeLine: 75, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META118:![0-9]+]]) +; DEBUG: [[META118]] = !{[[META119]], [[META120]], [[META121]], [[META122]], [[META123]], [[META124]]} +; DEBUG: [[META119]] = !DILocalVariable(name: "41", scope: [[DBG117]], file: [[META1]], line: 75, type: [[META10]]) +; DEBUG: [[META120]] = !DILocalVariable(name: "42", scope: [[DBG117]], file: [[META1]], line: 76, type: [[META10]]) +; DEBUG: [[META121]] = !DILocalVariable(name: "43", scope: [[DBG117]], file: [[META1]], line: 78, type: [[META10]]) +; DEBUG: [[META122]] = !DILocalVariable(name: "44", scope: [[DBG117]], file: [[META1]], line: 80, type: [[META10]]) +; DEBUG: [[META123]] = !DILocalVariable(name: "45", scope: [[DBG117]], file: [[META1]], line: 82, type: [[META10]]) +; DEBUG: [[META124]] = !DILocalVariable(name: "46", scope: [[DBG117]], file: [[META1]], line: 84, type: [[META16]]) +; DEBUG: [[META125]] = !DILocation(line: 75, column: 1, scope: [[DBG117]]) +; DEBUG: [[META126]] = !DILocation(line: 76, column: 1, scope: [[DBG117]]) +; DEBUG: [[META127]] = !DILocation(line: 78, column: 1, scope: [[DBG117]]) +; DEBUG: [[META128]] = !DILocation(line: 80, column: 1, scope: [[DBG117]]) +; DEBUG: 
[[META129]] = !DILocation(line: 82, column: 1, scope: [[DBG117]]) +; DEBUG: [[DBG130]] = !DILocation(line: 83, column: 1, scope: [[DBG117]]) +; DEBUG: [[META131]] = !DILocation(line: 84, column: 1, scope: [[DBG117]]) +; DEBUG: [[DBG132]] = !DILocation(line: 85, column: 1, scope: [[DBG117]]) ;. ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK-MODIFY-CFG: {{.*}} From 1d1e3d38590f3f47dd0fa667ed0e2314ea78342a Mon Sep 17 00:00:00 2001 From: chengjunp Date: Tue, 19 Aug 2025 04:20:04 +0000 Subject: [PATCH 4/8] Do not handle ptr cases --- llvm/lib/Transforms/Scalar/SROA.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 7a0ebf7ce6bc0..5a0aa3365444f 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -2980,6 +2980,10 @@ class AllocaSliceRewriter : public InstVisitor { AllocatedEltTy = FixedVecTy->getElementType(); else return std::nullopt; + // If the allocated element type is a pointer, we do not handle it + // TODO: handle this case by using inttoptr/ptrtoint + if (AllocatedEltTy->isPtrOrPtrVectorTy()) + return std::nullopt; for (Slice &S : P) { auto *User = cast(S.getUse()->getUser()); @@ -2997,6 +3001,10 @@ class AllocaSliceRewriter : public InstVisitor { if (DL.getTypeSizeInBits(FixedVecTy) != DL.getTypeSizeInBits(NewAI.getAllocatedType())) return std::nullopt; + // If the loaded value is a pointer, we do not handle it + // TODO: handle this case by using inttoptr/ptrtoint + if (FixedVecTy->getElementType()->isPtrOrPtrVectorTy()) + return std::nullopt; TheLoad = LI; } else if (auto *SI = dyn_cast(User)) { // The stored value should be a fixed vector type @@ -3009,6 +3017,10 @@ class AllocaSliceRewriter : public InstVisitor { if (DL.getTypeSizeInBits(StoredValueType) % DL.getTypeSizeInBits(AllocatedEltTy) != 0) return std::nullopt; + // If the stored value is a pointer, we do 
not handle it + // TODO: handle this case by using inttoptr/ptrtoint + if (StoredValueType->isPtrOrPtrVectorTy()) + return std::nullopt; StoreInfos.emplace_back(SI, S.beginOffset(), S.endOffset(), SI->getValueOperand()); } else { From a8849460978d33eec56560bd12683c7971d237a3 Mon Sep 17 00:00:00 2001 From: chengjunp Date: Tue, 19 Aug 2025 04:23:39 +0000 Subject: [PATCH 5/8] format --- llvm/lib/Transforms/Scalar/SROA.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 5a0aa3365444f..9c5c3b3d50555 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -2703,8 +2703,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, /// \return A new vector containing all elements from V0 followed by all /// elements from V1 static Value *mergeTwoVectors(Value *V0, Value *V1, const DataLayout &DL, - Type *NewAIEltTy, - IRBuilder<> &Builder) { + Type *NewAIEltTy, IRBuilder<> &Builder) { assert(V0->getType()->isVectorTy() && V1->getType()->isVectorTy() && "Can not merge two non-vector values"); @@ -2723,8 +2722,7 @@ static Value *mergeTwoVectors(Value *V0, Value *V1, const DataLayout &DL, // Calculate new number of elements to maintain same bit width unsigned TotalBits = VecType->getNumElements() * DL.getTypeSizeInBits(EltType); - unsigned NewNumElts = - TotalBits / DL.getTypeSizeInBits(NewAIEltTy); + unsigned NewNumElts = TotalBits / DL.getTypeSizeInBits(NewAIEltTy); auto *NewVecType = FixedVectorType::get(NewAIEltTy, NewNumElts); V = Builder.CreateBitCast(V, NewVecType); @@ -3011,11 +3009,12 @@ class AllocaSliceRewriter : public InstVisitor { Type *StoredValueType = SI->getValueOperand()->getType(); if (!isa(StoredValueType)) return std::nullopt; - + // The total number of stored bits should be the multiple of the new // alloca element type size if (DL.getTypeSizeInBits(StoredValueType) % - 
DL.getTypeSizeInBits(AllocatedEltTy) != 0) + DL.getTypeSizeInBits(AllocatedEltTy) != + 0) return std::nullopt; // If the stored value is a pointer, we do not handle it // TODO: handle this case by using inttoptr/ptrtoint From 3146a3b65467814fb5dce0a8043837f74e8ef34e Mon Sep 17 00:00:00 2001 From: chengjunp Date: Wed, 27 Aug 2025 20:07:37 +0000 Subject: [PATCH 6/8] Fix bugs and update tests --- llvm/lib/Transforms/Scalar/SROA.cpp | 91 ++-- ...r-promotion-cannot-tree-structure-merge.ll | 99 +++-- ...ctor-promotion-via-tree-structure-merge.ll | 403 +++++------------- 3 files changed, 220 insertions(+), 373 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index c24655e7492d9..aeea2d31c7a4e 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -2693,9 +2693,6 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, /// elements from V1 static Value *mergeTwoVectors(Value *V0, Value *V1, const DataLayout &DL, Type *NewAIEltTy, IRBuilder<> &Builder) { - assert(V0->getType()->isVectorTy() && V1->getType()->isVectorTy() && - "Can not merge two non-vector values"); - // V0 and V1 are vectors // Create a new vector type with combined elements // Use ShuffleVector to concatenate the vectors @@ -2737,18 +2734,15 @@ static Value *mergeTwoVectors(Value *V0, Value *V1, const DataLayout &DL, unsigned SmallSize = std::min(NumElts0, NumElts1); unsigned LargeSize = std::max(NumElts0, NumElts1); bool IsV0Smaller = NumElts0 < NumElts1; - Value *SmallVec = IsV0Smaller ? V0 : V1; - + Value *&ExtendedVec = IsV0Smaller ? 
V0 : V1; SmallVector ExtendMask; for (unsigned i = 0; i < SmallSize; ++i) ExtendMask.push_back(i); for (unsigned i = SmallSize; i < LargeSize; ++i) ExtendMask.push_back(PoisonMaskElem); - Value *ExtendedVec = Builder.CreateShuffleVector( - SmallVec, PoisonValue::get(SmallVec->getType()), ExtendMask); + ExtendedVec = Builder.CreateShuffleVector( + ExtendedVec, PoisonValue::get(ExtendedVec->getType()), ExtendMask); LLVM_DEBUG(dbgs() << " shufflevector: " << *ExtendedVec << "\n"); - V0 = IsV0Smaller ? ExtendedVec : V0; - V1 = IsV0Smaller ? V1 : ExtendedVec; for (unsigned i = 0; i < NumElts0; ++i) ShuffleMask.push_back(i); for (unsigned i = 0; i < NumElts1; ++i) @@ -2961,53 +2955,45 @@ class AllocaSliceRewriter : public InstVisitor { SmallVector StoreInfos; - // The alloca must be a fixed vector type - Type *AllocatedEltTy = nullptr; - if (auto *FixedVecTy = dyn_cast(NewAI.getAllocatedType())) - AllocatedEltTy = FixedVecTy->getElementType(); - else - return std::nullopt; - // If the allocated element type is a pointer, we do not handle it - // TODO: handle this case by using inttoptr/ptrtoint - if (AllocatedEltTy->isPtrOrPtrVectorTy()) - return std::nullopt; + // If the new alloca is a fixed vector type, we use its element type as the + // allocated element type, otherwise we use i8 as the allocated element + Type *AllocatedEltTy = + isa(NewAI.getAllocatedType()) + ? cast(NewAI.getAllocatedType())->getElementType() + : Type::getInt8Ty(NewAI.getContext()); + + // Helper to check if a type is + // 1. A fixed vector type + // 2. The element type is not a pointer + // 3. 
The element type size is byte-aligned + // We only handle the cases that the ld/st meet these conditions + auto IsTypeValidForTreeStructuredMerge = [&](Type *Ty) -> bool { + auto *FixedVecTy = dyn_cast(Ty); + return FixedVecTy && + DL.getTypeSizeInBits(FixedVecTy->getElementType()) % 8 == 0 && + !FixedVecTy->getElementType()->isPointerTy(); + }; for (Slice &S : P) { auto *User = cast(S.getUse()->getUser()); if (auto *LI = dyn_cast(User)) { - // Do not handle the case where there is more than one load - // TODO: maybe we can handle this case - if (TheLoad) - return std::nullopt; - // If load is not a fixed vector type, we do not handle it - // If the number of loaded bits is not the same as the new alloca type - // size, we do not handle it - auto *FixedVecTy = dyn_cast(LI->getType()); - if (!FixedVecTy) - return std::nullopt; - if (DL.getTypeSizeInBits(FixedVecTy) != - DL.getTypeSizeInBits(NewAI.getAllocatedType())) - return std::nullopt; - // If the loaded value is a pointer, we do not handle it - // TODO: handle this case by using inttoptr/ptrtoint - if (FixedVecTy->getElementType()->isPtrOrPtrVectorTy()) + // Do not handle the case if + // 1. There is more than one load + // 2. The load is volatile + // 3. The load does not read the entire alloca structure + // 4. 
The load does not meet the conditions in the helper function + if (TheLoad || !IsTypeValidForTreeStructuredMerge(LI->getType()) || + S.beginOffset() != NewAllocaBeginOffset || + S.endOffset() != NewAllocaEndOffset || + LI->isVolatile()) return std::nullopt; TheLoad = LI; } else if (auto *SI = dyn_cast(User)) { - // The stored value should be a fixed vector type - Type *StoredValueType = SI->getValueOperand()->getType(); - if (!isa(StoredValueType)) - return std::nullopt; - - // The total number of stored bits should be the multiple of the new - // alloca element type size - if (DL.getTypeSizeInBits(StoredValueType) % - DL.getTypeSizeInBits(AllocatedEltTy) != - 0) - return std::nullopt; - // If the stored value is a pointer, we do not handle it - // TODO: handle this case by using inttoptr/ptrtoint - if (StoredValueType->isPtrOrPtrVectorTy()) + // Do not handle the case if + // 1. The store does not meet the conditions in the helper function + // 2. The store is volatile + if (!IsTypeValidForTreeStructuredMerge(SI->getValueOperand()->getType()) || + SI->isVolatile()) return std::nullopt; StoreInfos.emplace_back(SI, S.beginOffset(), S.endOffset(), SI->getValueOperand()); @@ -3033,7 +3019,6 @@ class AllocaSliceRewriter : public InstVisitor { // Check for overlaps and coverage uint64_t ExpectedStart = NewAllocaBeginOffset; - TypeSize TotalStoreBits = TypeSize::getZero(); for (auto &StoreInfo : StoreInfos) { uint64_t BeginOff = StoreInfo.BeginOffset; uint64_t EndOff = StoreInfo.EndOffset; @@ -3043,13 +3028,9 @@ class AllocaSliceRewriter : public InstVisitor { return std::nullopt; ExpectedStart = EndOff; - TotalStoreBits += - DL.getTypeSizeInBits(StoreInfo.Store->getValueOperand()->getType()); } // Check that stores cover the entire alloca - // We need check both the end offset and the total store bits - if (ExpectedStart != NewAllocaEndOffset || - TotalStoreBits != DL.getTypeSizeInBits(NewAI.getAllocatedType())) + if (ExpectedStart != NewAllocaEndOffset) return 
std::nullopt; // Stores should be in the same basic block diff --git a/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll b/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll index e4b106856de47..c858d071451e8 100644 --- a/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll +++ b/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll @@ -21,12 +21,12 @@ entry: define <4 x float> @test_more_than_one_load(<2 x float> %a, <2 x float> %b) { entry: - %alloca = alloca <4 x float> + %alloca = alloca [4 x float] - %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0 + %ptr0 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 0 store <2 x float> %a, ptr %ptr0 - %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2 + %ptr1 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 2 store <2 x float> %b, ptr %ptr1 %result1 = load <4 x float>, ptr %alloca @@ -38,19 +38,19 @@ entry: define void @test_no_load(<4 x float> %a) { entry: - %alloca = alloca <4 x float> + %alloca = alloca [4 x float] store <4 x float> %a, ptr %alloca ret void } define i32 @test_load_not_fixed_vector(<2 x float> %a, <2 x float> %b) { entry: - %alloca = alloca <4 x float> + %alloca = alloca [4 x float] - %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0 + %ptr0 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 0 store <2 x float> %a, ptr %ptr0 - %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2 + %ptr1 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 2 store <2 x float> %b, ptr %ptr1 %result = load i32, ptr %alloca @@ -59,12 +59,12 @@ entry: define <3 x float> @test_load_not_covering_alloca(<2 x float> %a, <2 x float> %b) { entry: - %alloca = alloca <4 x float> + %alloca = alloca [4 x float] - %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0 + %ptr0 = getelementptr inbounds [4 x float], 
ptr %alloca, i32 0, i32 0 store <2 x float> %a, ptr %ptr0 - %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2 + %ptr1 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 2 store <2 x float> %b, ptr %ptr1 %result = load <3 x float>, ptr %ptr0 @@ -73,9 +73,9 @@ entry: define <4 x float> @test_store_not_fixed_vector( %a) { entry: - %alloca = alloca <4 x float> + %alloca = alloca [4 x float] - %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0 + %ptr0 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 0 %fixed = extractelement %a, i32 0 store float %fixed, ptr %ptr0 @@ -86,7 +86,7 @@ entry: define <4 x float> @test_no_stores() { entry: - %alloca = alloca <4 x float> + %alloca = alloca [4 x float] %result = load <4 x float>, ptr %alloca ret <4 x float> %result @@ -94,15 +94,15 @@ entry: define <4 x float> @test_stores_overlapping(<2 x float> %a, <2 x float> %b, <2 x float> %c) { entry: - %alloca = alloca <4 x float> + %alloca = alloca [4 x float] - %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0 + %ptr0 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 0 store <2 x float> %a, ptr %ptr0 - %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 1 + %ptr1 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 1 store <2 x float> %b, ptr %ptr1 - %ptr2 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2 + %ptr2 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 2 store <2 x float> %c, ptr %ptr2 %result = load <4 x float>, ptr %alloca @@ -111,9 +111,9 @@ entry: define <4 x float> @test_stores_not_covering_alloca(<2 x float> %a) { entry: - %alloca = alloca <4 x float> + %alloca = alloca [4 x float] - %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0 + %ptr0 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 0 store <2 x float> %a, ptr %ptr0 %result = load <4 x float>, ptr %alloca @@ -122,15 +122,15 @@ entry: 
define <4 x float> @test_stores_not_same_basic_block(<2 x float> %a, <2 x float> %b, i1 %cond) { entry: - %alloca = alloca <4 x float> + %alloca = alloca [4 x float] - %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0 + %ptr0 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 0 store <2 x float> %a, ptr %ptr0 br i1 %cond, label %then, label %else then: - %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2 + %ptr1 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 2 store <2 x float> %b, ptr %ptr1 br label %merge @@ -144,14 +144,14 @@ merge: define <4 x float> @test_load_before_stores(<2 x float> %a, <2 x float> %b) { entry: - %alloca = alloca <4 x float> + %alloca = alloca [4 x float] - %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0 + %ptr0 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 0 store <2 x float> %a, ptr %ptr0 %intermediate = load <4 x float>, ptr %alloca - %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2 + %ptr1 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 2 store <2 x float> %b, ptr %ptr1 ret <4 x float> %intermediate @@ -159,21 +159,64 @@ entry: define <4 x float> @test_other_instructions(<2 x float> %a, <2 x float> %b) { entry: - %alloca = alloca <4 x float> + %alloca = alloca [4 x float] ; Store first vector - %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0 + %ptr0 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 0 store <2 x float> %a, ptr %ptr0 ; Other instruction (memset) that's not a simple load/store call void @llvm.memset.p0.i64(ptr %alloca, i8 0, i64 8, i1 false) ; Store second vector - %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2 + %ptr1 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 2 store <2 x float> %b, ptr %ptr1 %result = load <4 x float>, ptr %alloca ret <4 x float> %result } +define <4 x float> @volatile_stores(<2 x i32> %a, <2 
x i32> %b) { +entry: + %alloca = alloca [4 x float] + + %ptr0 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 0 + store volatile <2 x i32> %a, ptr %ptr0 + + %ptr1 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 2 + store volatile <2 x i32> %b, ptr %ptr1 + + %result = load <4 x float>, ptr %alloca + ret <4 x float> %result +} + +define <4 x float> @volatile_loads(<2 x i32> %a, <2 x i32> %b) { +entry: + %alloca = alloca [4 x float] + + %ptr0 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 0 + store <2 x i32> %a, ptr %ptr0 + + %ptr1 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 2 + store <2 x i32> %b, ptr %ptr1 + + %result = load volatile <4 x float>, ptr %alloca + ret <4 x float> %result +} + +define <4 x i15> @non_byte_aligned_alloca(<2 x i15> %a, <2 x i15> %b) { +entry: + %alloca = alloca [4 x i15] + + %ptr0 = getelementptr inbounds [4 x i15], ptr %alloca, i32 0, i32 0 + store <2 x i15> %a, ptr %ptr0 + + %ptr1 = getelementptr inbounds [4 x i15], ptr %alloca, i32 0, i32 2 + store <2 x i15> %b, ptr %ptr1 + + %result = load <4 x i15>, ptr %alloca + ret <4 x i15> %result + +} + declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) diff --git a/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll b/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll index 83bc48b617f29..8bfe0bb83051e 100644 --- a/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll +++ b/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt < %s -passes='sroa' -S | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG ; RUN: opt < %s -passes='sroa' -S | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG -; RUN: opt < %s -passes=debugify,sroa -S | FileCheck %s --check-prefix=DEBUG target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64" ; Basic tree-structured merge: 4 stores of <2 x float> into <8 x float> @@ -14,37 +13,21 @@ define <8 x float> @basic_tree_merge(<2 x float> %a, <2 x float> %b, <2 x float> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <8 x i32> ; CHECK-NEXT: ret <8 x float> [[TMP2]] ; -; DEBUG-LABEL: define <8 x float> @basic_tree_merge( -; DEBUG-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], <2 x float> [[D:%.*]]) !dbg [[DBG5:![0-9]+]] { -; DEBUG-NEXT: [[ENTRY:.*:]] -; DEBUG-NEXT: #dbg_value(ptr poison, [[META9:![0-9]+]], !DIExpression(), [[META17:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META11:![0-9]+]], !DIExpression(), [[META18:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META12:![0-9]+]], !DIExpression(), [[META19:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META13:![0-9]+]], !DIExpression(), [[META20:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META14:![0-9]+]], !DIExpression(), [[META21:![0-9]+]]) -; DEBUG-NEXT: [[TMP0:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <4 x i32> , !dbg [[DBG22:![0-9]+]] -; DEBUG-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[C]], <2 x float> [[D]], <4 x i32> , !dbg [[DBG22]] -; DEBUG-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <8 x i32> , !dbg [[DBG22]] -; DEBUG-NEXT: #dbg_value(<8 x float> [[TMP2]], [[META15:![0-9]+]], !DIExpression(), [[META23:![0-9]+]]) -; DEBUG-NEXT: ret <8 x float> [[TMP2]], !dbg [[DBG24:![0-9]+]] -; entry: - %alloca = alloca <8 x float> + %alloca = alloca [8 x float] - ; Store the vectors at different offsets - %ptr0 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 0 + %ptr0 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 0 store <2 x float> %a, ptr %ptr0 - %ptr1 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 2 + %ptr1 = 
getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 2 store <2 x float> %b, ptr %ptr1 - %ptr2 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 4 + %ptr2 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 4 store <2 x float> %c, ptr %ptr2 - %ptr3 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 6 + %ptr3 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 6 store <2 x float> %d, ptr %ptr3 - ; Load the complete vector %result = load <8 x float>, ptr %alloca ret <8 x float> %result } @@ -59,42 +42,24 @@ define void @multiple_partitions(<2 x float> %a, <2 x float> %b, <2 x float> %c, ; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[F]], align 16 ; CHECK-NEXT: ret void ; -; DEBUG-LABEL: define void @multiple_partitions( -; DEBUG-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], <2 x float> [[D:%.*]], ptr [[E:%.*]], ptr [[F:%.*]]) !dbg [[DBG25:![0-9]+]] { -; DEBUG-NEXT: [[ENTRY:.*:]] -; DEBUG-NEXT: #dbg_value(ptr poison, [[META27:![0-9]+]], !DIExpression(), [[META36:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META27]], !DIExpression(), [[META36]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META28:![0-9]+]], !DIExpression(), [[META37:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META29:![0-9]+]], !DIExpression(), [[META38:![0-9]+]]) -; DEBUG-NEXT: [[TMP0:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <4 x i32> , !dbg [[DBG39:![0-9]+]] -; DEBUG-NEXT: #dbg_value(ptr undef, [[META30:![0-9]+]], !DIExpression(), [[META40:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META31:![0-9]+]], !DIExpression(), [[META41:![0-9]+]]) -; DEBUG-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[C]], <2 x float> [[D]], <4 x i32> , !dbg [[DBG42:![0-9]+]] -; DEBUG-NEXT: #dbg_value(<4 x float> [[TMP0]], [[META32:![0-9]+]], !DIExpression(), [[META43:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META34:![0-9]+]], !DIExpression(), [[META44:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(<4 x float> [[TMP1]], 
[[META35:![0-9]+]], !DIExpression(), [[META45:![0-9]+]]) -; DEBUG-NEXT: store <4 x float> [[TMP0]], ptr [[E]], align 16, !dbg [[DBG46:![0-9]+]] -; DEBUG-NEXT: store <4 x float> [[TMP1]], ptr [[F]], align 16, !dbg [[DBG47:![0-9]+]] -; DEBUG-NEXT: ret void, !dbg [[DBG48:![0-9]+]] -; entry: - %alloca = alloca <8 x float> + %alloca = alloca [8 x float] - %ptr0 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 0 + %ptr0 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 0 store <2 x float> %a, ptr %ptr0 - %ptr1 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 2 + %ptr1 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 2 store <2 x float> %b, ptr %ptr1 - %ptr2 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 4 + %ptr2 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 4 store <2 x float> %c, ptr %ptr2 - %ptr3 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 6 + %ptr3 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 6 store <2 x float> %d, ptr %ptr3 %result1 = load <4 x float>, ptr %alloca - %ptr_offset4 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 4 + %ptr_offset4 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 4 %result2 = load <4 x float>, ptr %ptr_offset4 store <4 x float> %result1, ptr %e @@ -113,34 +78,19 @@ define <8 x i32> @out_of_order_stores(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP2]] ; -; DEBUG-LABEL: define <8 x i32> @out_of_order_stores( -; DEBUG-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], <2 x i32> [[D:%.*]]) !dbg [[DBG49:![0-9]+]] { -; DEBUG-NEXT: [[ENTRY:.*:]] -; DEBUG-NEXT: #dbg_value(ptr poison, [[META51:![0-9]+]], !DIExpression(), [[META57:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META52:![0-9]+]], !DIExpression(), [[META58:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr 
undef, [[META53:![0-9]+]], !DIExpression(), [[META59:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META54:![0-9]+]], !DIExpression(), [[META60:![0-9]+]]) -; DEBUG-NEXT: [[TMP0:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <4 x i32> , !dbg [[DBG61:![0-9]+]] -; DEBUG-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> [[D]], <4 x i32> , !dbg [[DBG61]] -; DEBUG-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> , !dbg [[DBG61]] -; DEBUG-NEXT: #dbg_value(ptr undef, [[META55:![0-9]+]], !DIExpression(), [[META62:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(<8 x i32> [[TMP2]], [[META56:![0-9]+]], !DIExpression(), [[META63:![0-9]+]]) -; DEBUG-NEXT: ret <8 x i32> [[TMP2]], !dbg [[DBG64:![0-9]+]] -; entry: - %alloca = alloca <8 x i32> + %alloca = alloca [8 x i32] - ; Store out of order - %ptr2 = getelementptr inbounds <8 x i32>, ptr %alloca, i32 0, i32 4 + %ptr2 = getelementptr inbounds [8 x i32], ptr %alloca, i32 0, i32 4 store <2 x i32> %c, ptr %ptr2 - %ptr0 = getelementptr inbounds <8 x i32>, ptr %alloca, i32 0, i32 0 + %ptr0 = getelementptr inbounds [8 x i32], ptr %alloca, i32 0, i32 0 store <2 x i32> %a, ptr %ptr0 - %ptr3 = getelementptr inbounds <8 x i32>, ptr %alloca, i32 0, i32 6 + %ptr3 = getelementptr inbounds [8 x i32], ptr %alloca, i32 0, i32 6 store <2 x i32> %d, ptr %ptr3 - %ptr1 = getelementptr inbounds <8 x i32>, ptr %alloca, i32 0, i32 2 + %ptr1 = getelementptr inbounds [8 x i32], ptr %alloca, i32 0, i32 2 store <2 x i32> %b, ptr %ptr1 %result = load <8 x i32>, ptr %alloca @@ -161,46 +111,24 @@ define <8 x i16> @single_element_stores(<1 x i16> %a, <1 x i16> %b, <1 x i16> %c ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> ; CHECK-NEXT: ret <8 x i16> [[TMP6]] ; -; DEBUG-LABEL: define <8 x i16> @single_element_stores( -; DEBUG-SAME: <1 x i16> [[A:%.*]], <1 x i16> [[B:%.*]], <1 x i16> [[C:%.*]], <1 x i16> [[D:%.*]], <1 x i16> [[E:%.*]], <1 x i16> [[F:%.*]], <1 x i16> 
[[G:%.*]], <1 x i16> [[H:%.*]]) !dbg [[DBG65:![0-9]+]] { -; DEBUG-NEXT: [[ENTRY:.*:]] -; DEBUG-NEXT: #dbg_value(ptr poison, [[META67:![0-9]+]], !DIExpression(), [[META77:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META68:![0-9]+]], !DIExpression(), [[META78:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META69:![0-9]+]], !DIExpression(), [[META79:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META70:![0-9]+]], !DIExpression(), [[META80:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META71:![0-9]+]], !DIExpression(), [[META81:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META72:![0-9]+]], !DIExpression(), [[META82:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META73:![0-9]+]], !DIExpression(), [[META83:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META74:![0-9]+]], !DIExpression(), [[META84:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META75:![0-9]+]], !DIExpression(), [[META85:![0-9]+]]) -; DEBUG-NEXT: [[TMP0:%.*]] = shufflevector <1 x i16> [[A]], <1 x i16> [[B]], <2 x i32> , !dbg [[DBG86:![0-9]+]] -; DEBUG-NEXT: [[TMP1:%.*]] = shufflevector <1 x i16> [[C]], <1 x i16> [[D]], <2 x i32> , !dbg [[DBG86]] -; DEBUG-NEXT: [[TMP2:%.*]] = shufflevector <1 x i16> [[E]], <1 x i16> [[F]], <2 x i32> , !dbg [[DBG86]] -; DEBUG-NEXT: [[TMP3:%.*]] = shufflevector <1 x i16> [[G]], <1 x i16> [[H]], <2 x i32> , !dbg [[DBG86]] -; DEBUG-NEXT: [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> [[TMP1]], <4 x i32> , !dbg [[DBG86]] -; DEBUG-NEXT: [[TMP5:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP3]], <4 x i32> , !dbg [[DBG86]] -; DEBUG-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> , !dbg [[DBG86]] -; DEBUG-NEXT: #dbg_value(<8 x i16> [[TMP6]], [[META76:![0-9]+]], !DIExpression(), [[META87:![0-9]+]]) -; DEBUG-NEXT: ret <8 x i16> [[TMP6]], !dbg [[DBG88:![0-9]+]] -; entry: - %alloca = alloca <8 x i16> + %alloca = alloca [8 x i16] - %ptr0 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 0 + 
%ptr0 = getelementptr inbounds [8 x i16], ptr %alloca, i32 0, i32 0 store <1 x i16> %a, ptr %ptr0 - %ptr1 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 1 + %ptr1 = getelementptr inbounds [8 x i16], ptr %alloca, i32 0, i32 1 store <1 x i16> %b, ptr %ptr1 - %ptr2 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 2 + %ptr2 = getelementptr inbounds [8 x i16], ptr %alloca, i32 0, i32 2 store <1 x i16> %c, ptr %ptr2 - %ptr3 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 3 + %ptr3 = getelementptr inbounds [8 x i16], ptr %alloca, i32 0, i32 3 store <1 x i16> %d, ptr %ptr3 - %ptr4 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 4 + %ptr4 = getelementptr inbounds [8 x i16], ptr %alloca, i32 0, i32 4 store <1 x i16> %e, ptr %ptr4 - %ptr5 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 5 + %ptr5 = getelementptr inbounds [8 x i16], ptr %alloca, i32 0, i32 5 store <1 x i16> %f, ptr %ptr5 - %ptr6 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 6 + %ptr6 = getelementptr inbounds [8 x i16], ptr %alloca, i32 0, i32 6 store <1 x i16> %g, ptr %ptr6 - %ptr7 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 7 + %ptr7 = getelementptr inbounds [8 x i16], ptr %alloca, i32 0, i32 7 store <1 x i16> %h, ptr %ptr7 %result = load <8 x i16>, ptr %alloca @@ -217,29 +145,16 @@ define <6 x float> @non_power_of_2(<2 x float> %a, <2 x float> %b, <2 x float> % ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <6 x i32> ; CHECK-NEXT: ret <6 x float> [[TMP2]] ; -; DEBUG-LABEL: define <6 x float> @non_power_of_2( -; DEBUG-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]]) !dbg [[DBG89:![0-9]+]] { -; DEBUG-NEXT: [[ENTRY:.*:]] -; DEBUG-NEXT: #dbg_value(ptr poison, [[META91:![0-9]+]], !DIExpression(), [[META96:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META92:![0-9]+]], !DIExpression(), [[META97:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, 
[[META93:![0-9]+]], !DIExpression(), [[META98:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META94:![0-9]+]], !DIExpression(), [[META99:![0-9]+]]) -; DEBUG-NEXT: [[TMP0:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <4 x i32> , !dbg [[DBG100:![0-9]+]] -; DEBUG-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[C]], <2 x float> poison, <4 x i32> , !dbg [[DBG100]] -; DEBUG-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <6 x i32> , !dbg [[DBG100]] -; DEBUG-NEXT: #dbg_value(<6 x float> [[TMP2]], [[META95:![0-9]+]], !DIExpression(), [[META101:![0-9]+]]) -; DEBUG-NEXT: ret <6 x float> [[TMP2]], !dbg [[DBG102:![0-9]+]] -; entry: - %alloca = alloca <6 x float> + %alloca = alloca [6 x float] - %ptr0 = getelementptr inbounds <6 x float>, ptr %alloca, i32 0, i32 0 + %ptr0 = getelementptr inbounds [6 x float], ptr %alloca, i32 0, i32 0 store <2 x float> %a, ptr %ptr0 - %ptr1 = getelementptr inbounds <6 x float>, ptr %alloca, i32 0, i32 2 + %ptr1 = getelementptr inbounds [6 x float], ptr %alloca, i32 0, i32 2 store <2 x float> %b, ptr %ptr1 - %ptr2 = getelementptr inbounds <6 x float>, ptr %alloca, i32 0, i32 4 + %ptr2 = getelementptr inbounds [6 x float], ptr %alloca, i32 0, i32 4 store <2 x float> %c, ptr %ptr2 %result = load <6 x float>, ptr %alloca @@ -257,30 +172,16 @@ define <7 x float> @store_with_different_size_of_vectors(<1 x float> %a, <4 x fl ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> [[TMP2]], <7 x i32> ; CHECK-NEXT: ret <7 x float> [[TMP3]] ; -; DEBUG-LABEL: define <7 x float> @store_with_different_size_of_vectors( -; DEBUG-SAME: <1 x float> [[A:%.*]], <4 x float> [[B:%.*]], <2 x float> [[C:%.*]]) !dbg [[DBG103:![0-9]+]] { -; DEBUG-NEXT: [[ENTRY:.*:]] -; DEBUG-NEXT: #dbg_value(ptr poison, [[META105:![0-9]+]], !DIExpression(), [[META110:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META106:![0-9]+]], !DIExpression(), [[META111:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, 
[[META107:![0-9]+]], !DIExpression(), [[META112:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META108:![0-9]+]], !DIExpression(), [[META113:![0-9]+]]) -; DEBUG-NEXT: [[TMP0:%.*]] = shufflevector <1 x float> [[A]], <1 x float> poison, <4 x i32> , !dbg [[DBG114:![0-9]+]] -; DEBUG-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[B]], <5 x i32> , !dbg [[DBG114]] -; DEBUG-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[C]], <2 x float> poison, <5 x i32> , !dbg [[DBG114]] -; DEBUG-NEXT: [[TMP3:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> [[TMP2]], <7 x i32> , !dbg [[DBG114]] -; DEBUG-NEXT: #dbg_value(<7 x float> [[TMP3]], [[META109:![0-9]+]], !DIExpression(), [[META115:![0-9]+]]) -; DEBUG-NEXT: ret <7 x float> [[TMP3]], !dbg [[DBG116:![0-9]+]] -; entry: - %alloca = alloca <7 x float> + %alloca = alloca [7 x float] - %ptr0 = getelementptr inbounds <7 x float>, ptr %alloca, i32 0, i32 0 + %ptr0 = getelementptr inbounds [7 x float], ptr %alloca, i32 0, i32 0 store <1 x float> %a, ptr %ptr0 - %ptr1 = getelementptr inbounds <7 x float>, ptr %alloca, i32 0, i32 1 + %ptr1 = getelementptr inbounds [7 x float], ptr %alloca, i32 0, i32 1 store <4 x float> %b, ptr %ptr1 - %ptr2 = getelementptr inbounds <7 x float>, ptr %alloca, i32 0, i32 5 + %ptr2 = getelementptr inbounds [7 x float], ptr %alloca, i32 0, i32 5 store <2 x float> %c, ptr %ptr2 %result = load <7 x float>, ptr %alloca @@ -301,178 +202,100 @@ define <4 x double> @load_store_different_element_type(<2 x i32> %a, <2 x float> ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP5]], <4 x i32> ; CHECK-NEXT: ret <4 x double> [[TMP6]] ; -; DEBUG-LABEL: define <4 x double> @load_store_different_element_type( -; DEBUG-SAME: <2 x i32> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], <2 x i32> [[D:%.*]]) !dbg [[DBG117:![0-9]+]] { -; DEBUG-NEXT: [[ENTRY:.*:]] -; DEBUG-NEXT: #dbg_value(ptr poison, [[META119:![0-9]+]], !DIExpression(), [[META125:![0-9]+]]) 
-; DEBUG-NEXT: #dbg_value(ptr undef, [[META119]], !DIExpression(), [[META125]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META120:![0-9]+]], !DIExpression(), [[META126:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META121:![0-9]+]], !DIExpression(), [[META127:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META122:![0-9]+]], !DIExpression(), [[META128:![0-9]+]]) -; DEBUG-NEXT: #dbg_value(ptr undef, [[META123:![0-9]+]], !DIExpression(), [[META129:![0-9]+]]) -; DEBUG-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x double>, !dbg [[DBG130:![0-9]+]] -; DEBUG-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <1 x double>, !dbg [[DBG130]] -; DEBUG-NEXT: [[TMP2:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> , !dbg [[DBG130]] -; DEBUG-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[C]] to <1 x double>, !dbg [[DBG130]] -; DEBUG-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[D]] to <1 x double>, !dbg [[DBG130]] -; DEBUG-NEXT: [[TMP5:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP4]], <2 x i32> , !dbg [[DBG130]] -; DEBUG-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP5]], <4 x i32> , !dbg [[DBG130]] -; DEBUG-NEXT: #dbg_value(<4 x double> [[TMP6]], [[META124:![0-9]+]], !DIExpression(), [[META131:![0-9]+]]) -; DEBUG-NEXT: ret <4 x double> [[TMP6]], !dbg [[DBG132:![0-9]+]] -; entry: - %alloca = alloca <8 x float> + %alloca = alloca [8 x float] - ; Store the vectors at different offsets - %ptr0 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 0 + %ptr0 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 0 store <2 x i32> %a, ptr %ptr0 - %ptr1 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 2 + %ptr1 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 2 store <2 x float> %b, ptr %ptr1 - %ptr2 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 4 + %ptr2 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 4 store <2 x float> %c, ptr %ptr2 - 
%ptr3 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 6 + %ptr3 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 6 store <2 x i32> %d, ptr %ptr3 - ; Load the complete vector %result = load <4 x double>, ptr %alloca ret <4 x double> %result } -;. -; DEBUG: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: [[META1:![0-9]+]], producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) -; DEBUG: [[META1]] = !DIFile(filename: "{{.*}}", directory: {{.*}}) -; DEBUG: [[DBG5]] = distinct !DISubprogram(name: "basic_tree_merge", linkageName: "basic_tree_merge", scope: null, file: [[META1]], line: 1, type: [[META6:![0-9]+]], scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META8:![0-9]+]]) -; DEBUG: [[META6]] = !DISubroutineType(types: [[META7:![0-9]+]]) -; DEBUG: [[META7]] = !{} -; DEBUG: [[META8]] = !{[[META9]], [[META11]], [[META12]], [[META13]], [[META14]], [[META15]]} -; DEBUG: [[META9]] = !DILocalVariable(name: "1", scope: [[DBG5]], file: [[META1]], line: 1, type: [[META10:![0-9]+]]) -; DEBUG: [[META10]] = !DIBasicType(name: "ty64", size: 64, encoding: DW_ATE_unsigned) -; DEBUG: [[META11]] = !DILocalVariable(name: "2", scope: [[DBG5]], file: [[META1]], line: 2, type: [[META10]]) -; DEBUG: [[META12]] = !DILocalVariable(name: "3", scope: [[DBG5]], file: [[META1]], line: 4, type: [[META10]]) -; DEBUG: [[META13]] = !DILocalVariable(name: "4", scope: [[DBG5]], file: [[META1]], line: 6, type: [[META10]]) -; DEBUG: [[META14]] = !DILocalVariable(name: "5", scope: [[DBG5]], file: [[META1]], line: 8, type: [[META10]]) -; DEBUG: [[META15]] = !DILocalVariable(name: "6", scope: [[DBG5]], file: [[META1]], line: 10, type: [[META16:![0-9]+]]) -; DEBUG: [[META16]] = !DIBasicType(name: "ty256", size: 256, encoding: DW_ATE_unsigned) -; DEBUG: [[META17]] = !DILocation(line: 1, column: 1, scope: [[DBG5]]) -; DEBUG: [[META18]] = !DILocation(line: 2, column: 1, scope: 
[[DBG5]]) -; DEBUG: [[META19]] = !DILocation(line: 4, column: 1, scope: [[DBG5]]) -; DEBUG: [[META20]] = !DILocation(line: 6, column: 1, scope: [[DBG5]]) -; DEBUG: [[META21]] = !DILocation(line: 8, column: 1, scope: [[DBG5]]) -; DEBUG: [[DBG22]] = !DILocation(line: 9, column: 1, scope: [[DBG5]]) -; DEBUG: [[META23]] = !DILocation(line: 10, column: 1, scope: [[DBG5]]) -; DEBUG: [[DBG24]] = !DILocation(line: 11, column: 1, scope: [[DBG5]]) -; DEBUG: [[DBG25]] = distinct !DISubprogram(name: "multiple_partitions", linkageName: "multiple_partitions", scope: null, file: [[META1]], line: 12, type: [[META6]], scopeLine: 12, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META26:![0-9]+]]) -; DEBUG: [[META26]] = !{[[META27]], [[META28]], [[META29]], [[META30]], [[META31]], [[META32]], [[META34]], [[META35]]} -; DEBUG: [[META27]] = !DILocalVariable(name: "7", scope: [[DBG25]], file: [[META1]], line: 12, type: [[META10]]) -; DEBUG: [[META28]] = !DILocalVariable(name: "8", scope: [[DBG25]], file: [[META1]], line: 13, type: [[META10]]) -; DEBUG: [[META29]] = !DILocalVariable(name: "9", scope: [[DBG25]], file: [[META1]], line: 15, type: [[META10]]) -; DEBUG: [[META30]] = !DILocalVariable(name: "10", scope: [[DBG25]], file: [[META1]], line: 17, type: [[META10]]) -; DEBUG: [[META31]] = !DILocalVariable(name: "11", scope: [[DBG25]], file: [[META1]], line: 19, type: [[META10]]) -; DEBUG: [[META32]] = !DILocalVariable(name: "12", scope: [[DBG25]], file: [[META1]], line: 21, type: [[META33:![0-9]+]]) -; DEBUG: [[META33]] = !DIBasicType(name: "ty128", size: 128, encoding: DW_ATE_unsigned) -; DEBUG: [[META34]] = !DILocalVariable(name: "13", scope: [[DBG25]], file: [[META1]], line: 22, type: [[META10]]) -; DEBUG: [[META35]] = !DILocalVariable(name: "14", scope: [[DBG25]], file: [[META1]], line: 23, type: [[META33]]) -; DEBUG: [[META36]] = !DILocation(line: 12, column: 1, scope: [[DBG25]]) -; DEBUG: [[META37]] = !DILocation(line: 13, column: 1, scope: 
[[DBG25]]) -; DEBUG: [[META38]] = !DILocation(line: 15, column: 1, scope: [[DBG25]]) -; DEBUG: [[DBG39]] = !DILocation(line: 16, column: 1, scope: [[DBG25]]) -; DEBUG: [[META40]] = !DILocation(line: 17, column: 1, scope: [[DBG25]]) -; DEBUG: [[META41]] = !DILocation(line: 19, column: 1, scope: [[DBG25]]) -; DEBUG: [[DBG42]] = !DILocation(line: 20, column: 1, scope: [[DBG25]]) -; DEBUG: [[META43]] = !DILocation(line: 21, column: 1, scope: [[DBG25]]) -; DEBUG: [[META44]] = !DILocation(line: 22, column: 1, scope: [[DBG25]]) -; DEBUG: [[META45]] = !DILocation(line: 23, column: 1, scope: [[DBG25]]) -; DEBUG: [[DBG46]] = !DILocation(line: 24, column: 1, scope: [[DBG25]]) -; DEBUG: [[DBG47]] = !DILocation(line: 25, column: 1, scope: [[DBG25]]) -; DEBUG: [[DBG48]] = !DILocation(line: 26, column: 1, scope: [[DBG25]]) -; DEBUG: [[DBG49]] = distinct !DISubprogram(name: "out_of_order_stores", linkageName: "out_of_order_stores", scope: null, file: [[META1]], line: 27, type: [[META6]], scopeLine: 27, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META50:![0-9]+]]) -; DEBUG: [[META50]] = !{[[META51]], [[META52]], [[META53]], [[META54]], [[META55]], [[META56]]} -; DEBUG: [[META51]] = !DILocalVariable(name: "15", scope: [[DBG49]], file: [[META1]], line: 27, type: [[META10]]) -; DEBUG: [[META52]] = !DILocalVariable(name: "16", scope: [[DBG49]], file: [[META1]], line: 28, type: [[META10]]) -; DEBUG: [[META53]] = !DILocalVariable(name: "17", scope: [[DBG49]], file: [[META1]], line: 30, type: [[META10]]) -; DEBUG: [[META54]] = !DILocalVariable(name: "18", scope: [[DBG49]], file: [[META1]], line: 32, type: [[META10]]) -; DEBUG: [[META55]] = !DILocalVariable(name: "19", scope: [[DBG49]], file: [[META1]], line: 34, type: [[META10]]) -; DEBUG: [[META56]] = !DILocalVariable(name: "20", scope: [[DBG49]], file: [[META1]], line: 36, type: [[META16]]) -; DEBUG: [[META57]] = !DILocation(line: 27, column: 1, scope: [[DBG49]]) -; DEBUG: [[META58]] = 
!DILocation(line: 28, column: 1, scope: [[DBG49]]) -; DEBUG: [[META59]] = !DILocation(line: 30, column: 1, scope: [[DBG49]]) -; DEBUG: [[META60]] = !DILocation(line: 32, column: 1, scope: [[DBG49]]) -; DEBUG: [[DBG61]] = !DILocation(line: 33, column: 1, scope: [[DBG49]]) -; DEBUG: [[META62]] = !DILocation(line: 34, column: 1, scope: [[DBG49]]) -; DEBUG: [[META63]] = !DILocation(line: 36, column: 1, scope: [[DBG49]]) -; DEBUG: [[DBG64]] = !DILocation(line: 37, column: 1, scope: [[DBG49]]) -; DEBUG: [[DBG65]] = distinct !DISubprogram(name: "single_element_stores", linkageName: "single_element_stores", scope: null, file: [[META1]], line: 38, type: [[META6]], scopeLine: 38, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META66:![0-9]+]]) -; DEBUG: [[META66]] = !{[[META67]], [[META68]], [[META69]], [[META70]], [[META71]], [[META72]], [[META73]], [[META74]], [[META75]], [[META76]]} -; DEBUG: [[META67]] = !DILocalVariable(name: "21", scope: [[DBG65]], file: [[META1]], line: 38, type: [[META10]]) -; DEBUG: [[META68]] = !DILocalVariable(name: "22", scope: [[DBG65]], file: [[META1]], line: 39, type: [[META10]]) -; DEBUG: [[META69]] = !DILocalVariable(name: "23", scope: [[DBG65]], file: [[META1]], line: 41, type: [[META10]]) -; DEBUG: [[META70]] = !DILocalVariable(name: "24", scope: [[DBG65]], file: [[META1]], line: 43, type: [[META10]]) -; DEBUG: [[META71]] = !DILocalVariable(name: "25", scope: [[DBG65]], file: [[META1]], line: 45, type: [[META10]]) -; DEBUG: [[META72]] = !DILocalVariable(name: "26", scope: [[DBG65]], file: [[META1]], line: 47, type: [[META10]]) -; DEBUG: [[META73]] = !DILocalVariable(name: "27", scope: [[DBG65]], file: [[META1]], line: 49, type: [[META10]]) -; DEBUG: [[META74]] = !DILocalVariable(name: "28", scope: [[DBG65]], file: [[META1]], line: 51, type: [[META10]]) -; DEBUG: [[META75]] = !DILocalVariable(name: "29", scope: [[DBG65]], file: [[META1]], line: 53, type: [[META10]]) -; DEBUG: [[META76]] = 
!DILocalVariable(name: "30", scope: [[DBG65]], file: [[META1]], line: 55, type: [[META33]]) -; DEBUG: [[META77]] = !DILocation(line: 38, column: 1, scope: [[DBG65]]) -; DEBUG: [[META78]] = !DILocation(line: 39, column: 1, scope: [[DBG65]]) -; DEBUG: [[META79]] = !DILocation(line: 41, column: 1, scope: [[DBG65]]) -; DEBUG: [[META80]] = !DILocation(line: 43, column: 1, scope: [[DBG65]]) -; DEBUG: [[META81]] = !DILocation(line: 45, column: 1, scope: [[DBG65]]) -; DEBUG: [[META82]] = !DILocation(line: 47, column: 1, scope: [[DBG65]]) -; DEBUG: [[META83]] = !DILocation(line: 49, column: 1, scope: [[DBG65]]) -; DEBUG: [[META84]] = !DILocation(line: 51, column: 1, scope: [[DBG65]]) -; DEBUG: [[META85]] = !DILocation(line: 53, column: 1, scope: [[DBG65]]) -; DEBUG: [[DBG86]] = !DILocation(line: 54, column: 1, scope: [[DBG65]]) -; DEBUG: [[META87]] = !DILocation(line: 55, column: 1, scope: [[DBG65]]) -; DEBUG: [[DBG88]] = !DILocation(line: 56, column: 1, scope: [[DBG65]]) -; DEBUG: [[DBG89]] = distinct !DISubprogram(name: "non_power_of_2", linkageName: "non_power_of_2", scope: null, file: [[META1]], line: 57, type: [[META6]], scopeLine: 57, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META90:![0-9]+]]) -; DEBUG: [[META90]] = !{[[META91]], [[META92]], [[META93]], [[META94]], [[META95]]} -; DEBUG: [[META91]] = !DILocalVariable(name: "31", scope: [[DBG89]], file: [[META1]], line: 57, type: [[META10]]) -; DEBUG: [[META92]] = !DILocalVariable(name: "32", scope: [[DBG89]], file: [[META1]], line: 58, type: [[META10]]) -; DEBUG: [[META93]] = !DILocalVariable(name: "33", scope: [[DBG89]], file: [[META1]], line: 60, type: [[META10]]) -; DEBUG: [[META94]] = !DILocalVariable(name: "34", scope: [[DBG89]], file: [[META1]], line: 62, type: [[META10]]) -; DEBUG: [[META95]] = !DILocalVariable(name: "35", scope: [[DBG89]], file: [[META1]], line: 64, type: [[META16]]) -; DEBUG: [[META96]] = !DILocation(line: 57, column: 1, scope: [[DBG89]]) -; DEBUG: 
[[META97]] = !DILocation(line: 58, column: 1, scope: [[DBG89]]) -; DEBUG: [[META98]] = !DILocation(line: 60, column: 1, scope: [[DBG89]]) -; DEBUG: [[META99]] = !DILocation(line: 62, column: 1, scope: [[DBG89]]) -; DEBUG: [[DBG100]] = !DILocation(line: 63, column: 1, scope: [[DBG89]]) -; DEBUG: [[META101]] = !DILocation(line: 64, column: 1, scope: [[DBG89]]) -; DEBUG: [[DBG102]] = !DILocation(line: 65, column: 1, scope: [[DBG89]]) -; DEBUG: [[DBG103]] = distinct !DISubprogram(name: "store_with_different_size_of_vectors", linkageName: "store_with_different_size_of_vectors", scope: null, file: [[META1]], line: 66, type: [[META6]], scopeLine: 66, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META104:![0-9]+]]) -; DEBUG: [[META104]] = !{[[META105]], [[META106]], [[META107]], [[META108]], [[META109]]} -; DEBUG: [[META105]] = !DILocalVariable(name: "36", scope: [[DBG103]], file: [[META1]], line: 66, type: [[META10]]) -; DEBUG: [[META106]] = !DILocalVariable(name: "37", scope: [[DBG103]], file: [[META1]], line: 67, type: [[META10]]) -; DEBUG: [[META107]] = !DILocalVariable(name: "38", scope: [[DBG103]], file: [[META1]], line: 69, type: [[META10]]) -; DEBUG: [[META108]] = !DILocalVariable(name: "39", scope: [[DBG103]], file: [[META1]], line: 71, type: [[META10]]) -; DEBUG: [[META109]] = !DILocalVariable(name: "40", scope: [[DBG103]], file: [[META1]], line: 73, type: [[META16]]) -; DEBUG: [[META110]] = !DILocation(line: 66, column: 1, scope: [[DBG103]]) -; DEBUG: [[META111]] = !DILocation(line: 67, column: 1, scope: [[DBG103]]) -; DEBUG: [[META112]] = !DILocation(line: 69, column: 1, scope: [[DBG103]]) -; DEBUG: [[META113]] = !DILocation(line: 71, column: 1, scope: [[DBG103]]) -; DEBUG: [[DBG114]] = !DILocation(line: 72, column: 1, scope: [[DBG103]]) -; DEBUG: [[META115]] = !DILocation(line: 73, column: 1, scope: [[DBG103]]) -; DEBUG: [[DBG116]] = !DILocation(line: 74, column: 1, scope: [[DBG103]]) -; DEBUG: [[DBG117]] = distinct 
!DISubprogram(name: "load_store_different_element_type", linkageName: "load_store_different_element_type", scope: null, file: [[META1]], line: 75, type: [[META6]], scopeLine: 75, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META118:![0-9]+]]) -; DEBUG: [[META118]] = !{[[META119]], [[META120]], [[META121]], [[META122]], [[META123]], [[META124]]} -; DEBUG: [[META119]] = !DILocalVariable(name: "41", scope: [[DBG117]], file: [[META1]], line: 75, type: [[META10]]) -; DEBUG: [[META120]] = !DILocalVariable(name: "42", scope: [[DBG117]], file: [[META1]], line: 76, type: [[META10]]) -; DEBUG: [[META121]] = !DILocalVariable(name: "43", scope: [[DBG117]], file: [[META1]], line: 78, type: [[META10]]) -; DEBUG: [[META122]] = !DILocalVariable(name: "44", scope: [[DBG117]], file: [[META1]], line: 80, type: [[META10]]) -; DEBUG: [[META123]] = !DILocalVariable(name: "45", scope: [[DBG117]], file: [[META1]], line: 82, type: [[META10]]) -; DEBUG: [[META124]] = !DILocalVariable(name: "46", scope: [[DBG117]], file: [[META1]], line: 84, type: [[META16]]) -; DEBUG: [[META125]] = !DILocation(line: 75, column: 1, scope: [[DBG117]]) -; DEBUG: [[META126]] = !DILocation(line: 76, column: 1, scope: [[DBG117]]) -; DEBUG: [[META127]] = !DILocation(line: 78, column: 1, scope: [[DBG117]]) -; DEBUG: [[META128]] = !DILocation(line: 80, column: 1, scope: [[DBG117]]) -; DEBUG: [[META129]] = !DILocation(line: 82, column: 1, scope: [[DBG117]]) -; DEBUG: [[DBG130]] = !DILocation(line: 83, column: 1, scope: [[DBG117]]) -; DEBUG: [[META131]] = !DILocation(line: 84, column: 1, scope: [[DBG117]]) -; DEBUG: [[DBG132]] = !DILocation(line: 85, column: 1, scope: [[DBG117]]) -;. 
+define <8 x float> @bitcast_needed(<2 x i32> %a, <2 x i16> %b, <12 x i8> %c, <1 x i64> %d) { +; CHECK-LABEL: define <8 x float> @bitcast_needed( +; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i16> [[B:%.*]], <12 x i8> [[C:%.*]], <1 x i64> [[D:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <2 x float> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i16> [[B]] to <1 x float> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <1 x float> [[TMP1]], <1 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> [[TMP2]], <3 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <12 x i8> [[C]] to <3 x float> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[D]] to <2 x float> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <3 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <3 x float> [[TMP5]], <3 x float> [[TMP9]], <5 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <5 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <5 x float> [[TMP8]], <5 x float> [[TMP7]], <8 x i32> +; CHECK-NEXT: ret <8 x float> [[TMP6]] +; +entry: + %alloca = alloca [8 x float] + + %ptr0 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 0 + store <2 x i32> %a, ptr %ptr0 + + %ptr1 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 2 + store <2 x i16> %b, ptr %ptr1 + + %ptr2 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 3 + store <12 x i8> %c, ptr %ptr2 + + %ptr3 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 6 + store <1 x i64> %d, ptr %ptr3 + + %result = load <8 x float>, ptr %alloca + ret <8 x float> %result +} + +define <8 x float> @load_in_different_blocks(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d, i1 %cond) { +; CHECK-LABEL: define <8 x float> @load_in_different_blocks( +; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], <2 x float> 
[[D:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[C]], <2 x float> [[D]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <8 x i32> +; CHECK-NEXT: br i1 [[COND]], label %[[TRUEBRANCH:.*]], label %[[FALSEBRANCH:.*]] +; CHECK: [[TRUEBRANCH]]: +; CHECK-NEXT: br label %[[FALSEBRANCH]] +; CHECK: [[FALSEBRANCH]]: +; CHECK-NEXT: [[RESULT:%.*]] = phi <8 x float> [ poison, %[[ENTRY]] ], [ [[TMP2]], %[[TRUEBRANCH]] ] +; CHECK-NEXT: ret <8 x float> [[RESULT]] +; +entry: + %alloca = alloca [8 x float] + + %ptr0 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 0 + store <2 x float> %a, ptr %ptr0 + + %ptr1 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 2 + store <2 x float> %b, ptr %ptr1 + + %ptr2 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 4 + store <2 x float> %c, ptr %ptr2 + + %ptr3 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 6 + store <2 x float> %d, ptr %ptr3 + + br i1 %cond, label %TrueBranch, label %FalseBranch + +TrueBranch: + %load1 = load <8 x float>, ptr %alloca + br label %FalseBranch + +FalseBranch: + %result = phi <8 x float> [ poison, %entry ], [ %load1, %TrueBranch ] + ret <8 x float> %result +} + ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; CHECK-MODIFY-CFG: {{.*}} ; CHECK-PRESERVE-CFG: {{.*}} From a3c0c0628c721459582cf04906e8fb06969df585 Mon Sep 17 00:00:00 2001 From: chengjunp Date: Wed, 27 Aug 2025 20:20:18 +0000 Subject: [PATCH 7/8] Fix nits --- llvm/lib/Transforms/Scalar/SROA.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index aeea2d31c7a4e..c76510480a070 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -3035,6 +3035,12 @@ class AllocaSliceRewriter : public InstVisitor { // Stores should be in the same basic block // The load should not be in the middle of the stores + // Note: + // If the load is in a different basic block with the stores, we can still + // do the tree structured merge. This is because we do not have the + // store->load forwarding here. The merged vector will be stored back to + // NewAI and the new load will load from NewAI. The forwarding will be + // handled later when we try to promote NewAI. BasicBlock *LoadBB = TheLoad->getParent(); BasicBlock *StoreBB = StoreInfos[0].Store->getParent(); @@ -3067,8 +3073,8 @@ class AllocaSliceRewriter : public InstVisitor { LLVM_DEBUG(dbgs() << " Rewrite stores into shufflevectors:\n"); while (VecElements.size() > 1) { - uint64_t NumElts = VecElements.size(); - for (uint64_t i = 0; i < NumElts / 2; i++) { + const auto NumElts = VecElements.size(); + for ([[maybe_unused]] const auto _ : llvm::seq(NumElts / 2)) { Value *V0 = VecElements.front(); VecElements.pop(); Value *V1 = VecElements.front(); @@ -5268,9 +5274,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, PHIUsers, SelectUsers); bool Promotable = true; // Check whether we can have tree-structured merge. 
- std::optional> DeletedValues = - Rewriter.rewriteTreeStructuredMerge(P); - if (DeletedValues) { + if (auto DeletedValues = Rewriter.rewriteTreeStructuredMerge(P)) { NumUses += DeletedValues->size() + 1; for (Value *V : *DeletedValues) DeadInsts.push_back(V); From 8afd48404e50bae37ad6f187dababea770099bc0 Mon Sep 17 00:00:00 2001 From: chengjunp Date: Wed, 27 Aug 2025 20:29:32 +0000 Subject: [PATCH 8/8] Format --- llvm/lib/Transforms/Scalar/SROA.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index c76510480a070..cbef18555c4ba 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -2977,22 +2977,22 @@ class AllocaSliceRewriter : public InstVisitor { for (Slice &S : P) { auto *User = cast(S.getUse()->getUser()); if (auto *LI = dyn_cast(User)) { - // Do not handle the case if + // Do not handle the case if // 1. There is more than one load // 2. The load is volatile // 3. The load does not read the entire alloca structure // 4. The load does not meet the conditions in the helper function if (TheLoad || !IsTypeValidForTreeStructuredMerge(LI->getType()) || S.beginOffset() != NewAllocaBeginOffset || - S.endOffset() != NewAllocaEndOffset || - LI->isVolatile()) + S.endOffset() != NewAllocaEndOffset || LI->isVolatile()) return std::nullopt; TheLoad = LI; } else if (auto *SI = dyn_cast(User)) { - // Do not handle the case if + // Do not handle the case if // 1. The store does not meet the conditions in the helper function // 2. The store is volatile - if (!IsTypeValidForTreeStructuredMerge(SI->getValueOperand()->getType()) || + if (!IsTypeValidForTreeStructuredMerge( + SI->getValueOperand()->getType()) || SI->isVolatile()) return std::nullopt; StoreInfos.emplace_back(SI, S.beginOffset(), S.endOffset(),