diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index d6eb00da11dc8..4549fc3c1c582 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -257,6 +257,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   unsigned MinVecNumElts = MinVectorSize / ScalarSize;
   auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
   unsigned OffsetEltIndex = 0;
+  unsigned VectorRange = 0;
+  bool NeedCast = false;
   Align Alignment = Load->getAlign();
   if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
                                    &DT)) {
@@ -273,15 +275,28 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   if (Offset.isNegative())
     return false;

-  // The offset must be a multiple of the scalar element to shuffle cleanly
-  // in the element's size.
+  // If the offset is a multiple of the scalar element size, we can shuffle at
+  // that element size; otherwise, shrink the element size so that both the
+  // offset and the original scalar size are multiples of it.
   uint64_t ScalarSizeInBytes = ScalarSize / 8;
-  if (Offset.urem(ScalarSizeInBytes) != 0)
-    return false;
+  if (auto UnalignedBytes = Offset.urem(ScalarSizeInBytes)) {
+    if (DL->isBigEndian())
+      return false;
+    uint64_t OldScalarSizeInBytes = ScalarSizeInBytes;
+    // Shrink ScalarSizeInBytes to the greatest common divisor of the old
+    // scalar size and UnalignedBytes.
+    ScalarSizeInBytes = std::gcd(ScalarSizeInBytes, UnalignedBytes);
+    ScalarSize = ScalarSizeInBytes * 8;
+    VectorRange = OldScalarSizeInBytes / ScalarSizeInBytes;
+    MinVecNumElts = MinVectorSize / ScalarSize;
+    ScalarTy = Type::getIntNTy(I.getContext(), ScalarSize);
+    MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
+    NeedCast = true;
+  }

-  // If we load MinVecNumElts, will our target element still be loaded?
   OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
-  if (OffsetEltIndex >= MinVecNumElts)
+  // If we load MinVecNumElts, will our target element still be loaded?
+  if (OffsetEltIndex + VectorRange >= MinVecNumElts)
     return false;

   if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
@@ -299,11 +314,14 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
   Type *LoadTy = Load->getType();
   unsigned AS = Load->getPointerAddressSpace();
+  auto VecTy = cast<InsertElementInst>(&I)->getType();
+
   InstructionCost OldCost =
       TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
-  APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
+  APInt DemandedElts =
+      APInt::getOneBitSet(VecTy->getElementCount().getFixedValue(), 0);
   OldCost +=
-      TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
+      TTI.getScalarizationOverhead(VecTy, DemandedElts,
                                    /* Insert */ true, HasExtract, CostKind);

   // New pattern: load VecPtr
@@ -317,13 +335,32 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // Note that we could use freeze to avoid poison problems, but then we might
   // still need a shuffle to change the vector size.
   auto *Ty = cast<FixedVectorType>(I.getType());
-  unsigned OutputNumElts = Ty->getNumElements();
-  SmallVector<int> Mask(OutputNumElts, PoisonMaskElem);
-  assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
-  Mask[0] = OffsetEltIndex;
-  if (OffsetEltIndex)
-    NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
-                                  CostKind);
+  SmallVector<int> Mask;
+  assert(OffsetEltIndex + VectorRange < MinVecNumElts &&
+         "Address offset too big");
+  if (NeedCast) {
+    Mask.assign(MinVecNumElts, PoisonMaskElem);
+    std::iota(Mask.begin(), Mask.begin() + VectorRange, OffsetEltIndex);
+  } else {
+    unsigned OutputNumElts = Ty->getNumElements();
+    Mask.assign(OutputNumElts, PoisonMaskElem);
+    Mask[0] = OffsetEltIndex;
+  }
+
+  if (OffsetEltIndex) {
+    if (NeedCast) {
+      NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy,
+                                    MinVecTy, Mask, CostKind);
+    } else {
+      NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy,
+                                    Mask, CostKind);
+    }
+  }
+
+  if (NeedCast)
+    NewCost += TTI.getCastInstrCost(Instruction::BitCast, Ty, MinVecTy,
+                                    TargetTransformInfo::CastContextHint::None,
+                                    CostKind);

   // We can aggressively convert to the vector form because the backend can
   // invert this transform if it does not result in a performance win.
@@ -333,12 +370,17 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // It is safe and potentially profitable to load a vector directly:
   // inselt undef, load Scalar, 0 --> load VecPtr
   IRBuilder<> Builder(Load);
+  Value *Result;
   Value *CastedPtr =
       Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
-  Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
-  VecLd = Builder.CreateShuffleVector(VecLd, Mask);
+  Result = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
+  Worklist.pushValue(Result);
+  Result = Builder.CreateShuffleVector(Result, Mask);
+  Worklist.pushValue(Result);
+  if (NeedCast)
+    Result = Builder.CreateBitOrPointerCast(Result, I.getType());

-  replaceValue(I, *VecLd);
+  replaceValue(I, *Result);
   ++NumVecLoad;
   return true;
 }
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index e8381d1b206e2..39b0280ab82b6 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX2

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

@@ -159,8 +159,7 @@ define double @larger_fp_scalar_256bit_vec(ptr align 32 dereferenceable(32) %p)
 define <4 x float> @load_f32_insert_v4f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
 ; CHECK-LABEL: @load_f32_insert_v4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32>
-; CHECK-NEXT:    ret <4 x float> [[R]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %s = load float, ptr %p, align 4
   %r = insertelement <4 x float> poison, float %s, i32 0
@@ -170,8 +169,7 @@ define <4 x float> @load_f32_insert_v4f32(ptr align 16 dereferenceable(16) %p) n
 define <4 x float> @casted_load_f32_insert_v4f32(ptr align 4 dereferenceable(16) %p) nofree nosync {
 ; CHECK-LABEL: @casted_load_f32_insert_v4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32>
-; CHECK-NEXT:    ret <4 x float> [[R]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %s = load float, ptr %p, align 4
   %r = insertelement <4 x float> poison, float %s, i32 0
@@ -183,8 +181,7 @@ define <4 x float> @casted_load_f32_insert_v4f32(ptr align 4 dereferenceable(16)
 define <4 x i32> @load_i32_insert_v4i32(ptr align 16 dereferenceable(16) %p) nofree nosync {
 ; CHECK-LABEL: @load_i32_insert_v4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32>
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 ;
   %s = load i32, ptr %p, align 4
   %r = insertelement <4 x i32> poison, i32 %s, i32 0
@@ -196,8 +193,7 @@ define <4 x i32> @load_i32_insert_v4i32(ptr align 16 dereferenceable(16) %p) nof
 define <4 x i32> @casted_load_i32_insert_v4i32(ptr align 4 dereferenceable(16) %p) nofree nosync {
 ; CHECK-LABEL: @casted_load_i32_insert_v4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32>
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 ;
   %s = load i32, ptr %p, align 4
   %r = insertelement <4 x i32> poison, i32 %s, i32 0
@@ -209,8 +205,7 @@ define <4 x i32> @casted_load_i32_insert_v4i32(ptr align 4 dereferenceable(16) %
 define <4 x float> @gep00_load_f32_insert_v4f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
 ; CHECK-LABEL: @gep00_load_f32_insert_v4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32>
-; CHECK-NEXT:    ret <4 x float> [[R]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %s = load float, ptr %p, align 16
   %r = insertelement <4 x float> poison, float %s, i64 0
@@ -222,8 +217,7 @@ define <4 x float> @gep00_load_f32_insert_v4f32(ptr align 16 dereferenceable(16)
 define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(ptr addrspace(44) align 16 dereferenceable(16) %p) nofree nosync {
 ; CHECK-LABEL: @gep00_load_f32_insert_v4f32_addrspace(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr addrspace(44) [[P:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32>
-; CHECK-NEXT:    ret <4 x float> [[R]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %s = load float, ptr addrspace(44) %p, align 16
   %r = insertelement <4 x float> poison, float %s, i64 0
@@ -235,8 +229,8 @@ define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(ptr addrspace(44) alig
 define <4 x i32> @unsafe_load_i32_insert_v4i32_addrspace(ptr align 16 dereferenceable(16) %v3) {
 ; CHECK-LABEL: @unsafe_load_i32_insert_v4i32_addrspace(
 ; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[V3:%.*]] to ptr addrspace(42)
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(42) [[TMP1]], align 16
-; CHECK-NEXT:    [[INSELT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = load <3 x i32>, ptr addrspace(42) [[TMP1]], align 16
+; CHECK-NEXT:    [[INSELT:%.*]] = shufflevector <3 x i32> [[TMP2]], <3 x i32> poison, <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[INSELT]]
 ;
   %t0 = getelementptr inbounds i32, ptr %v3, i32 1
@@ -253,8 +247,7 @@ define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) %
 ; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
 ; CHECK-NEXT:    [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
-; CHECK-NEXT:    [[R1:%.*]] = shufflevector <8 x i16> [[R]], <8 x i16> poison, <8 x i32>
-; CHECK-NEXT:    ret <8 x i16> [[R1]]
+; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
   %s = load i16, ptr %gep, align 2
@@ -266,8 +259,8 @@ define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) %

 define <8 x i16> @gep01_load_i16_insert_v8i16_deref(ptr align 16 dereferenceable(17) %p) nofree nosync {
 ; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref(
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr [[P:%.*]], align 16
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <8 x i32>
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
@@ -280,8 +273,8 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref(ptr align 16 dereferenceable

 define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 dereferenceable(16) %p) nofree nosync {
 ; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 2
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr [[P:%.*]], align 2
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <8 x i32>
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
@@ -290,23 +283,121 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 derefer
   ret <8 x i16> %r
 }

-; Negative test - if we are shuffling a load from the base pointer, the address offset
-; must be a multiple of element size.
-; TODO: Could bitcast around this limitation.
+define <4 x i32> @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
+;
+  %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
+  %s = load i32, ptr %gep, align 1
+  %r = insertelement <4 x i32> poison, i32 %s, i64 0
+  ret <4 x i32> %r
+}

-define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(16) %p) nofree nosync {
-; CHECK-LABEL: @gep01_bitcast_load_i32_insert_v4i32(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+define <2 x i64> @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; SSE2-NEXT:    ret <2 x i64> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+; AVX2-NEXT:    ret <2 x i64> [[R]]
 ;
   %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
+  %s = load i64, ptr %gep, align 1
+  %r = insertelement <2 x i64> poison, i64 %s, i64 0
+  ret <2 x i64> %r
+}
+
+define <4 x i32> @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 11
+; SSE2-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
+;
+  %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 11
+  %s = load i32, ptr %gep, align 1
+  %r = insertelement <4 x i32> poison, i32 %s, i64 0
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT:    [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
+  %s = load i32, ptr %gep, align 1
+  %r = insertelement <4 x i32> poison, i32 %s, i64 0
+  ret <4 x i32> %r
+}
+
+define <2 x i64> @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; SSE2-NEXT:    ret <2 x i64> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <2 x i64>
+; AVX2-NEXT:    ret <2 x i64> [[R]]
+;
+  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
+  %s = load i64, ptr %gep, align 1
+  %r = insertelement <2 x i64> poison, i64 %s, i64 0
+  ret <2 x i64> %r
+}
+
+define <4 x i32> @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT:    [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 5
   %s = load i32, ptr %gep, align 1
   %r = insertelement <4 x i32> poison, i32 %s, i64 0
   ret <4 x i32> %r
 }

+define <2 x i64> @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %gep = getelementptr inbounds <4 x i32>, ptr %p, i64 0, i64 1
+  %s = load i64, ptr %gep, align 1
+  %r = insertelement <2 x i64> poison, i64 %s, i64 0
+  ret <2 x i64> %r
+}
+
 define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(20) %p) nofree nosync {
 ; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
@@ -342,8 +433,7 @@ define <8 x i16> @gep10_load_i16_insert_v8i16(ptr align 16 dereferenceable(32) %
 ; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
 ; CHECK-NEXT:    [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
-; CHECK-NEXT:    [[R1:%.*]] = shufflevector <8 x i16> [[R]], <8 x i16> poison, <8 x i32>
-; CHECK-NEXT:    ret <8 x i16> [[R1]]
+; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
   %s = load i16, ptr %gep, align 16
@@ -445,8 +535,7 @@ define <4 x float> @load_f32_insert_v4f32_volatile(ptr align 16 dereferenceable(
 define <4 x float> @load_f32_insert_v4f32_align(ptr align 1 dereferenceable(16) %p) nofree nosync {
 ; CHECK-LABEL: @load_f32_insert_v4f32_align(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32>
-; CHECK-NEXT:    ret <4 x float> [[R]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %s = load float, ptr %p, align 4
   %r = insertelement <4 x float> poison, float %s, i32 0
@@ -468,8 +557,8 @@ define <4 x float> @load_f32_insert_v4f32_deref(ptr align 4 dereferenceable(15)

 define <8 x i32> @load_i32_insert_v8i32(ptr align 16 dereferenceable(16) %p) nofree nosync {
 ; CHECK-LABEL: @load_i32_insert_v8i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 16
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <8 x i32>
 ; CHECK-NEXT:    ret <8 x i32> [[R]]
 ;
   %s = load i32, ptr %p, align 4
@@ -479,8 +568,8 @@ define <8 x i32> @load_i32_insert_v8i32(ptr align 16 dereferenceable(16) %p) nof

 define <8 x i32> @casted_load_i32_insert_v8i32(ptr align 4 dereferenceable(16) %p) nofree nosync {
 ; CHECK-LABEL: @casted_load_i32_insert_v8i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <8 x i32>
 ; CHECK-NEXT:    ret <8 x i32> [[R]]
 ;
   %s = load i32, ptr %p, align 4
@@ -490,8 +579,8 @@ define <8 x i32> @casted_load_i32_insert_v8i32(ptr align 4 dereferenceable(16) %

 define <16 x float> @load_f32_insert_v16f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
 ; CHECK-LABEL: @load_f32_insert_v16f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x float>, ptr [[P:%.*]], align 16
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <1 x float> [[TMP1]], <1 x float> poison, <16 x i32>
 ; CHECK-NEXT:    ret <16 x float> [[R]]
 ;
   %s = load float, ptr %p, align 4
@@ -501,8 +590,7 @@ define <16 x float> @load_f32_insert_v16f32(ptr align 16 dereferenceable(16) %p)

 define <2 x float> @load_f32_insert_v2f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
 ; CHECK-LABEL: @load_f32_insert_v2f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32>
+; CHECK-NEXT:    [[R:%.*]] = load <2 x float>, ptr [[P:%.*]], align 16
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
   %s = load float, ptr %p, align 4
@@ -552,8 +640,7 @@ define void @PR47558_multiple_use_load(ptr nocapture nonnull %resultptr, ptr noc
 define <4 x float> @load_v2f32_extract_insert_v4f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
 ; CHECK-LABEL: @load_v2f32_extract_insert_v4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32>
-; CHECK-NEXT:    ret <4 x float> [[R]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %l = load <2 x float>, ptr %p, align 4
   %s = extractelement <2 x float> %l, i32 0
@@ -564,8 +651,7 @@ define <4 x float> @load_v2f32_extract_insert_v4f32(ptr align 16 dereferenceable
 define <4 x float> @load_v8f32_extract_insert_v4f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
 ; CHECK-LABEL: @load_v8f32_extract_insert_v4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32>
-; CHECK-NEXT:    ret <4 x float> [[R]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %l = load <8 x float>, ptr %p, align 4
   %s = extractelement <8 x float> %l, i32 0
@@ -645,8 +731,7 @@ define <4 x float> @load_v2f32_extract_insert_v4f32_tsan(ptr align 16 dereferenc

 define <2 x float> @load_f32_insert_v2f32_msan(ptr align 16 dereferenceable(16) %p) nofree nosync sanitize_memory {
 ; CHECK-LABEL: @load_f32_insert_v2f32_msan(
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32>
+; CHECK-NEXT:    [[R:%.*]] = load <2 x float>, ptr [[P:%.*]], align 16
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
   %s = load float, ptr %p, align 4