diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 42d1d9a437bb2..1e229a5544963 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -490,6 +490,10 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, for_each(DVRAssignMarkerRange, MigrateDbgAssign); } +static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset, + uint64_t Size); +static Type *getTypePartition(const AllocaInst &AI, const Partition &P); + namespace { /// A custom IRBuilder inserter which prefixes all names, but only in @@ -1011,6 +1015,35 @@ static Value *foldPHINodeOrSelectInst(Instruction &I) { return foldSelectInst(cast<SelectInst>(I)); } +/// Returns a fixed vector type equivalent to the memory set by \p II, or +/// nullptr if not viable. +static FixedVectorType *getVectorTypeFor(const DataLayout &DL, Type *PartTy, + const MemSetInst &II) { + auto *PartVecTy = dyn_cast_or_null<FixedVectorType>(PartTy); + if (!PartVecTy) + return nullptr; + + const uint64_t PartVecSize = DL.getTypeStoreSize(PartVecTy).getFixedValue(); + + const ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); + if (!Length) + return nullptr; + + const APInt &Val = Length->getValue(); + if (Val.ugt(PartVecSize)) + return nullptr; + + // Element type will always be i8. TODO: Support + // llvm.experimental.memset.pattern? + return FixedVectorType::get(II.getValue()->getType(), Val.getZExtValue()); +} + +static FixedVectorType *getVectorTypeFor(const AllocaInst &AI, + const Partition &P, + const MemSetInst &II) { + return getVectorTypeFor(AI.getDataLayout(), getTypePartition(AI, P), II); +} + /// Builder for the alloca slices. /// /// This class builds a set of alloca slices by recursively visiting the uses @@ -1022,6 +1055,7 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> { using Base = PtrUseVisitor<SliceBuilder>; const uint64_t AllocSize; + const AllocaInst &AI; AllocaSlices &AS; SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap; @@ -1034,7 +1068,7 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> { SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS) : PtrUseVisitor<SliceBuilder>(DL), AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue()), - AS(AS) {} + AI(AI), AS(AS) {} private: void markAsDead(Instruction &I) { @@ -1182,10 +1216,17 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> { if (!IsOffsetKnown) return PI.setAborted(&II); - insertUse(II, Offset, - Length ? Length->getLimitedValue() - : AllocSize - Offset.getLimitedValue(), - (bool)Length); + uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - Offset.getLimitedValue(); + bool Splittable = (bool)Length; + if (Splittable) { + // Encourage the use of vector types by making this non-splittable if the + // memset corresponds to a viable vector type. + Type *PartTy = getTypePartition(DL, AI.getAllocatedType(), + Offset.getLimitedValue(), Size); + Splittable = !getVectorTypeFor(DL, PartTy, II); + } + insertUse(II, Offset, Size, Splittable); } void visitMemTransferInst(MemTransferInst &II) { @@ -2118,11 +2159,12 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, /// /// This function is called to test each entry in a partition which is slated /// for a single slice.
-static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, - VectorType *Ty, +static bool isVectorPromotionViableForSlice(const AllocaInst &AI, Partition &P, + const Slice &S, VectorType *Ty, uint64_t ElementSize, - const DataLayout &DL, unsigned VScale) { + const DataLayout &DL = AI.getDataLayout(); + // First validate the slice offsets. uint64_t BeginOffset = std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset(); @@ -2150,8 +2192,20 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { if (MI->isVolatile()) return false; - if (!S.isSplittable()) - return false; // Skip any unsplittable intrinsics. + + if (!S.isSplittable()) { + // Skip any non-memset unsplittable intrinsics. + auto *II = dyn_cast<MemSetInst>(U->getUser()); + if (!II) + return false; + + // For a memset, allow it if we have a viable vector type. + Type *VTy = getVectorTypeFor(AI, P, *II); + if (!VTy) + return false; + if (!canConvertValue(DL, SliceTy, VTy)) + return false; + } } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) { if (!II->isLifetimeStartOrEnd() && !II->isDroppable()) return false; @@ -2193,8 +2247,9 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, /// This implements the necessary checking for \c checkVectorTypesForPromotion /// (and thus isVectorPromotionViable) over all slices of the alloca for the /// given VectorType. -static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy, - const DataLayout &DL, unsigned VScale) { +static bool checkVectorTypeForPromotion(const AllocaInst &AI, Partition &P, + VectorType *VTy, unsigned VScale) { + const DataLayout &DL = AI.getDataLayout(); uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue(); @@ -2207,11 +2262,11 @@ static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy, ElementSize /= 8; for (const Slice &S : P) - if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL, VScale)) + if (!isVectorPromotionViableForSlice(AI, P, S, VTy, ElementSize, VScale)) return false; for (const Slice *S : P.splitSliceTails()) - if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL, VScale)) + if (!isVectorPromotionViableForSlice(AI, P, *S, VTy, ElementSize, VScale)) return false; return true; @@ -2222,11 +2277,12 @@ static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy, /// This implements the necessary checking for \c isVectorPromotionViable over /// all slices of the alloca for the given VectorType. static VectorType * -checkVectorTypesForPromotion(Partition &P, const DataLayout &DL, +checkVectorTypesForPromotion(const AllocaInst &AI, Partition &P, SmallVectorImpl<VectorType *> &CandidateTys, bool HaveCommonEltTy, Type *CommonEltTy, bool HaveVecPtrTy, bool HaveCommonVecPtrTy, VectorType *CommonVecPtrTy, unsigned VScale) { + const DataLayout &DL = AI.getDataLayout(); // If we didn't find a vector type, nothing to do here.
if (CandidateTys.empty()) return nullptr; @@ -2302,7 +2358,7 @@ checkVectorTypesForPromotion(Partition &P, const DataLayout &DL, }); for (VectorType *VTy : CandidateTys) - if (checkVectorTypeForPromotion(P, VTy, DL, VScale)) + if (checkVectorTypeForPromotion(AI, P, VTy, VScale)) return VTy; return nullptr; @@ -2310,10 +2366,11 @@ checkVectorTypesForPromotion(Partition &P, const DataLayout &DL, static VectorType *createAndCheckVectorTypesForPromotion( SetVector<Type *> &OtherTys, ArrayRef<VectorType *> CandidateTysCopy, - function_ref<void(Type *)> CheckCandidateType, Partition &P, - const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys, + function_ref<void(Type *)> CheckCandidateType, const AllocaInst &AI, + Partition &P, SmallVectorImpl<VectorType *> &CandidateTys, bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy, bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) { + const DataLayout &DL = AI.getDataLayout(); [[maybe_unused]] VectorType *OriginalElt = CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr; // Consider additional vector types where the element type size is a @@ -2339,7 +2396,7 @@ static VectorType *createAndCheckVectorTypesForPromotion( } return checkVectorTypesForPromotion( - P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, + AI, P, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy, CommonVecPtrTy, VScale); } @@ -2352,10 +2409,11 @@ static VectorType *createAndCheckVectorTypesForPromotion( /// SSA value. We only can ensure this for a limited set of operations, and we /// don't want to do the rewrites unless we are confident that the result will /// be promotable, so we have an early test here. -static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL, +static VectorType *isVectorPromotionViable(const AllocaInst &AI, Partition &P, unsigned VScale) { // Collect the candidate types for vector-based promotion. Also track whether // we have different element types. + const DataLayout &DL = AI.getDataLayout(); SmallVector<VectorType *, 4> CandidateTys; SetVector<Type *> LoadStoreTys; SetVector<Type *> DeferredTys; @@ -2395,12 +2453,16 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL, // Put load and store types into a set for de-duplication.
for (const Slice &S : P) { - Type *Ty; + Type *Ty = nullptr; if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser())) Ty = LI->getType(); else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser())) Ty = SI->getValueOperand()->getType(); - else + else if (auto *II = dyn_cast<MemSetInst>(S.getUse()->getUser())) { + Ty = getVectorTypeFor(AI, P, *II); + if (!Ty) + continue; + } else continue; auto CandTy = Ty->getScalarType(); @@ -2418,14 +2480,14 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL, SmallVector<VectorType *> CandidateTysCopy = CandidateTys; if (auto *VTy = createAndCheckVectorTypesForPromotion( - LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL, + LoadStoreTys, CandidateTysCopy, CheckCandidateType, AI, P, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy, CommonVecPtrTy, VScale)) return VTy; CandidateTys.clear(); return createAndCheckVectorTypesForPromotion( - DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys, + DeferredTys, CandidateTysCopy, CheckCandidateType, AI, P, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy, CommonVecPtrTy, VScale); } @@ -4410,6 +4472,13 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset, return SubTy; } +static Type *getTypePartition(const AllocaInst &AI, const Partition &P) { + if (P.empty()) + return nullptr; + return getTypePartition(AI.getDataLayout(), AI.getAllocatedType(), + P.beginOffset(), P.size()); +} + /// Pre-split loads and stores to simplify rewriting. /// /// We want to break up the splittable load+store pairs as much as @@ -4957,12 +5026,12 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // If the common use types are not viable for promotion then attempt to find // another type that is viable. - if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL, VScale)) + if (SliceVecTy && !checkVectorTypeForPromotion(AI, P, SliceVecTy, VScale)) if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), P.beginOffset(), P.size())) { VectorType *TypePartitionVecTy = dyn_cast<VectorType>(TypePartitionTy); if (TypePartitionVecTy && - checkVectorTypeForPromotion(P, TypePartitionVecTy, DL, VScale)) + checkVectorTypeForPromotion(AI, P, TypePartitionVecTy, VScale)) SliceTy = TypePartitionTy; } @@ -4973,7 +5042,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL); VectorType *VecTy = - IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL, VScale); + IsIntegerPromotable ?
nullptr : isVectorPromotionViable(AI, P, VScale); if (VecTy) SliceTy = VecTy; diff --git a/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll index 72014912edd20..f3b1fa8ce4fdd 100644 --- a/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll +++ b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll @@ -285,8 +285,6 @@ bb: define amdgpu_kernel void @test_array_vector() #0 { ; CHECK-LABEL: @test_array_vector( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_5:%.*]] = alloca <8 x half>, align 16 -; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_5]], i8 0, i32 16, i1 false) ; CHECK-NEXT: [[DATA:%.*]] = load <4 x float>, ptr undef, align 16 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16> ; CHECK-NEXT: br label [[BB:%.*]] @@ -318,8 +316,6 @@ bb: define amdgpu_kernel void @test_array_vector2() #0 { ; CHECK-LABEL: @test_array_vector2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_5:%.*]] = alloca <8 x half>, align 16 -; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_5]], i8 0, i32 16, i1 false) ; CHECK-NEXT: [[DATA:%.*]] = load <4 x float>, ptr undef, align 16 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16> ; CHECK-NEXT: br label [[BB:%.*]] @@ -355,12 +351,10 @@ define amdgpu_kernel void @test_array_vector_no_vector_common_type() #0 { ; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_4:%.*]] = alloca float, align 4 ; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_7:%.*]] = alloca float, align 8 ; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_10:%.*]] = alloca float, align 4 -; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_13:%.*]] = alloca <8 x half>, align 16 ; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_0]], i8 0, i32 4, i1 false) ; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 4 [[B_BLOCKWISE_COPY_SROA_4]], i8 0, i32 4, i1 false) ; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 8 [[B_BLOCKWISE_COPY_SROA_7]], i8 0, i32 4, i1 false) ; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 4 [[B_BLOCKWISE_COPY_SROA_10]], i8 0, i32 4, i1 false) -; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_13]], i8 0, i32 16, i1 false) ; CHECK-NEXT: [[DATA1:%.*]] = load float, ptr undef, align 4 ; CHECK-NEXT: [[DATA2:%.*]] = load float, ptr undef, align 4 ; CHECK-NEXT: [[DATA3:%.*]] = load float, ptr undef, align 4 diff --git a/llvm/test/Transforms/SROA/vector-promotion-memset.ll b/llvm/test/Transforms/SROA/vector-promotion-memset.ll new file mode 100644 index 0000000000000..342ed51175145 --- /dev/null +++ b/llvm/test/Transforms/SROA/vector-promotion-memset.ll @@ -0,0 +1,145 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes='sroa' -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64" + +%struct_a = type { <32 x i8> } +define i8 @vector_promote_a(ptr %arg0) { +; CHECK-LABEL: @vector_promote_a( +; CHECK-NEXT: [[V0:%.*]] = load i8, ptr [[ARG0:%.*]], align 1 +; CHECK-NEXT: [[A0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <32 x i8> zeroinitializer, i8 [[V0]], i32 0 +; CHECK-NEXT: [[A0_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <32 x i8> [[A0_SROA_0_0_VEC_INSERT]], i32 4 +; CHECK-NEXT: ret i8 [[A0_SROA_0_4_VEC_EXTRACT]] +; + %a0 = alloca %struct_a, align 32 + call void @llvm.memset.p0.i64(ptr align 32 %a0, 
i8 0, i64 32, i1 false) + %v0 = load i8, ptr %arg0, align 1 + store i8 %v0, ptr %a0, align 1 + %p0 = getelementptr inbounds i8, ptr %a0, i64 4 + %v1 = load i8, ptr %p0, align 1 + ret i8 %v1 +} + +%struct_b = type { <16 x i16> } +define i16 @vector_promote_b(ptr %arg0) { +; CHECK-LABEL: @vector_promote_b( +; CHECK-NEXT: [[V0:%.*]] = load i16, ptr [[ARG0:%.*]], align 1 +; CHECK-NEXT: [[A0_SROA_0_20_VEC_INSERT:%.*]] = insertelement <16 x i16> zeroinitializer, i16 [[V0]], i32 10 +; CHECK-NEXT: [[A0_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <16 x i16> [[A0_SROA_0_20_VEC_INSERT]], i32 2 +; CHECK-NEXT: ret i16 [[A0_SROA_0_4_VEC_EXTRACT]] +; + %a0 = alloca %struct_b, align 32 + call void @llvm.memset.p0.i64(ptr align 32 %a0, i8 0, i64 32, i1 false) + %v0 = load i16, ptr %arg0, align 1 + %p0 = getelementptr inbounds i16, ptr %a0, i64 10 + store i16 %v0, ptr %p0, align 1 + %p1 = getelementptr inbounds i16, ptr %a0, i64 2 + %v1 = load i16, ptr %p1, align 1 + ret i16 %v1 +} + +%struct_c = type { <4 x i32> } +define i32 @vector_promote_c(ptr %arg0) { +; CHECK-LABEL: @vector_promote_c( +; CHECK-NEXT: [[V0:%.*]] = load i32, ptr [[ARG0:%.*]], align 1 +; CHECK-NEXT: [[A0_SROA_0_12_VEC_INSERT:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[V0]], i32 3 +; CHECK-NEXT: [[A0_SROA_0_8_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[A0_SROA_0_12_VEC_INSERT]], i32 2 +; CHECK-NEXT: ret i32 [[A0_SROA_0_8_VEC_EXTRACT]] +; + %a0 = alloca %struct_c, align 32 + call void @llvm.memset.p0.i64(ptr align 32 %a0, i8 0, i64 16, i1 false) + %v0 = load i32, ptr %arg0, align 1 + %p0 = getelementptr inbounds i32, ptr %a0, i64 3 + store i32 %v0, ptr %p0, align 1 + %p1 = getelementptr inbounds i32, ptr %a0, i64 2 + %v1 = load i32, ptr %p1, align 1 + ret i32 %v1 +} + +; These memsets do not get promoted because getTypePartition does not break +; vectors into smaller vectors. 
+%struct_d = type { <8 x i32> } +define i32 @vector_promote_d(ptr %arg0) { +; CHECK-LABEL: @vector_promote_d( +; CHECK-NEXT: [[A0_SROA_0:%.*]] = alloca [3 x i32], align 32 +; CHECK-NEXT: [[A0_SROA_4:%.*]] = alloca [3 x i32], align 4 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 32 [[A0_SROA_0]], i8 0, i64 12, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[A0_SROA_4]], i8 1, i64 12, i1 false) +; CHECK-NEXT: [[V0:%.*]] = load i32, ptr [[ARG0:%.*]], align 1 +; CHECK-NEXT: ret i32 16843009 +; + %a0 = alloca %struct_d, align 32 + + call void @llvm.memset.p0.i64(ptr align 32 %a0, i8 0, i64 16, i1 false) + %p0 = getelementptr inbounds i32, ptr %a0, i64 4 + call void @llvm.memset.p0.i64(ptr align 32 %p0, i8 1, i64 16, i1 false) + + %v0 = load i32, ptr %arg0, align 1 + %p1 = getelementptr inbounds i32, ptr %a0, i64 3 + store i32 %v0, ptr %p1, align 1 + %p2 = getelementptr inbounds i32, ptr %a0, i64 4 + %v1 = load i32, ptr %p2, align 1 + ret i32 %v1 +} + +%struct_e = type { %struct_c, %struct_c } +define i32 @vector_promote_e(ptr %arg0) { +; CHECK-LABEL: @vector_promote_e( +; CHECK-NEXT: [[V0:%.*]] = load i32, ptr [[ARG0:%.*]], align 1 +; CHECK-NEXT: [[A0_SROA_0_12_VEC_INSERT:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[V0]], i32 3 +; CHECK-NEXT: [[A0_SROA_2_16_VEC_EXTRACT:%.*]] = extractelement <4 x i32> splat (i32 16843009), i32 0 +; CHECK-NEXT: ret i32 [[A0_SROA_2_16_VEC_EXTRACT]] +; + %a0 = alloca %struct_e, align 32 + + call void @llvm.memset.p0.i64(ptr align 32 %a0, i8 0, i64 16, i1 false) + %p0 = getelementptr inbounds i32, ptr %a0, i64 4 + call void @llvm.memset.p0.i64(ptr align 32 %p0, i8 1, i64 16, i1 false) + + %v0 = load i32, ptr %arg0, align 1 + %p1 = getelementptr inbounds i32, ptr %a0, i64 3 + store i32 %v0, ptr %p1, align 1 + %p2 = getelementptr inbounds i32, ptr %a0, i64 4 + %v1 = load i32, ptr %p2, align 1 + ret i32 %v1 +} + +; Don't promote non-vector alloca type +%struct_f = type { [32 x i8] } +define i8 @vector_promote_f(ptr %arg0) { +; CHECK-LABEL: @vector_promote_f( +; CHECK-NEXT: [[A0_SROA_2:%.*]] = alloca [3 x i8], align 1 +; CHECK-NEXT: [[A0_SROA_3:%.*]] = alloca [27 x i8], align 1 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[A0_SROA_2]], i8 0, i64 3, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[A0_SROA_3]], i8 0, i64 27, i1 false) +; CHECK-NEXT: [[V0:%.*]] = load i8, ptr [[ARG0:%.*]], align 1 +; CHECK-NEXT: ret i8 0 +; + %a0 = alloca %struct_f, align 32 + call void @llvm.memset.p0.i64(ptr align 32 %a0, i8 0, i64 32, i1 false) + %v0 = load i8, ptr %arg0, align 1 + store i8 %v0, ptr %a0, align 1 + %p0 = getelementptr inbounds i8, ptr %a0, i64 4 + %v1 = load i8, ptr %p0, align 1 + ret i8 %v1 +} + +; Don't promote memset that crosses vector partition boundary +%struct_g = type { i8, i8, i8, i8, <32 x i8> } +define i8 @vector_promote_g(ptr %arg0) { +; CHECK-LABEL: @vector_promote_g( +; CHECK-NEXT: [[A0_SROA_2:%.*]] = alloca [27 x i8], align 1 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[A0_SROA_2]], i8 0, i64 27, i1 false) +; CHECK-NEXT: ret i8 0 +; + %a0 = alloca %struct_g, align 32 + call void @llvm.memset.p0.i64(ptr align 32 %a0, i8 0, i64 36, i1 false) + %p0 = getelementptr inbounds i8, ptr %a0, i64 8 + %v0 = load i8, ptr %p0, align 1 + ret i8 %v0 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr writeonly captures(none), i8, i64, i1 immarg) #0 +declare void @llvm.memset.p0.i32(ptr writeonly captures(none), i8, 
i32, i1 immarg) #0 + +attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: write) }