diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 5e41273556d3d..f693580929518 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -344,9 +344,12 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const { unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { if (Opcode == Instruction::Load || Opcode == Instruction::Store) return 32 * 4 / ElemWidth; - return (ElemWidth == 16 && ST->has16BitInsts()) ? 2 - : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2 - : 1; + // For a given width return the max number of elements that can be combined + // into a wider bit value: + return (ElemWidth == 8 && ST->has16BitInsts()) ? 4 + : (ElemWidth == 16 && ST->has16BitInsts()) ? 2 + : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2 + : 1; } unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize, @@ -1195,14 +1198,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp); - // Larger vector widths may require additional instructions, but are - // typically cheaper than scalarized versions. - unsigned NumVectorElts = cast(SrcTy)->getNumElements(); + unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType()); if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && - DL.getTypeSizeInBits(SrcTy->getElementType()) == 16) { - bool HasVOP3P = ST->hasVOP3PInsts(); + (ScalarSize == 16 || ScalarSize == 8)) { + // Larger vector widths may require additional instructions, but are + // typically cheaper than scalarized versions. 
+ unsigned NumVectorElts = cast(SrcTy)->getNumElements(); unsigned RequestedElts = count_if(Mask, [](int MaskElt) { return MaskElt != -1; }); + unsigned EltsPerReg = 32 / ScalarSize; if (RequestedElts == 0) return 0; switch (Kind) { @@ -1211,9 +1215,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, case TTI::SK_PermuteSingleSrc: { // With op_sel VOP3P instructions freely can access the low half or high // half of a register, so any swizzle of two elements is free. - if (HasVOP3P && NumVectorElts == 2) + if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2) return 0; - unsigned NumPerms = alignTo(RequestedElts, 2) / 2; + unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg; // SK_Broadcast just reuses the same mask unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms; return NumPerms + NumPermMasks; @@ -1225,12 +1229,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, return 0; // Insert/extract subvectors only require shifts / extract code to get the // relevant bits - return alignTo(RequestedElts, 2) / 2; + return alignTo(RequestedElts, EltsPerReg) / EltsPerReg; } case TTI::SK_PermuteTwoSrc: case TTI::SK_Splice: case TTI::SK_Select: { - unsigned NumPerms = alignTo(RequestedElts, 2) / 2; + unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg; // SK_Select just reuses the same mask unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms; return NumPerms + NumPermMasks; @@ -1505,3 +1509,30 @@ GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const { return AMDGPU::isShader(F->getCallingConv()) ? 
KnownIEEEMode::Off : KnownIEEEMode::On; } + +InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, + Align Alignment, + unsigned AddressSpace, + TTI::TargetCostKind CostKind, + TTI::OperandValueInfo OpInfo, + const Instruction *I) const { + if (VectorType *VecTy = dyn_cast(Src)) { + if ((Opcode == Instruction::Load || Opcode == Instruction::Store) && + VecTy->getElementType()->isIntegerTy(8)) { + return divideCeil(DL.getTypeSizeInBits(VecTy) - 1, + getLoadStoreVecRegBitWidth(AddressSpace)); + } + } + return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind, + OpInfo, I); +} + +unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const { + if (VectorType *VecTy = dyn_cast(Tp)) { + if (VecTy->getElementType()->isIntegerTy(8)) { + unsigned ElementCount = VecTy->getElementCount().getFixedValue(); + return divideCeil(ElementCount - 1, 4); + } + } + return BaseT::getNumberOfParts(Tp); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 64a244e33f18f..20da8344c9d37 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -288,6 +288,20 @@ class GCNTTIImpl final : public BasicTTIImplBase { /// "amdgpu-ieee"="true" and KnownIEEEMode::Off if we can assume /// "amdgpu-ieee"="false". KnownIEEEMode fpenvIEEEMode(const Instruction &I) const; + + /// Account for loads and stores of i8 vector types having reduced cost. + /// For example, the cost of loading four i8 values is the same as the cost + /// of loading a single i32 value. + InstructionCost getMemoryOpCost( + unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, + TTI::TargetCostKind CostKind, + TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None}, + const Instruction *I = nullptr) const override; + + /// When counting parts on AMD GPUs, account for i8s being grouped + /// together under a single i32 value. 
Otherwise fall back to base + /// implementation. + unsigned getNumberOfParts(Type *Tp) const override; }; } // end namespace llvm diff --git a/llvm/test/Analysis/CostModel/AMDGPU/load.ll b/llvm/test/Analysis/CostModel/AMDGPU/load.ll index 3f8016178e719..6ec84bd88cd4d 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/load.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/load.ll @@ -21,17 +21,17 @@ define void @loads_i1(i32 %arg) { define void @loads_i8(i32 %arg) { ; GFX90A-LABEL: 'loads_i8' ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %2 = load <2 x i8>, ptr poison, align 2 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %3 = load <3 x i8>, ptr poison, align 4 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %4 = load <4 x i8>, ptr poison, align 4 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load <2 x i8>, ptr poison, align 2 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load <3 x i8>, ptr poison, align 4 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load <4 x i8>, ptr poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = load i8, ptr poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = load <2 x i8>, ptr poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %7 = load <3 x i8>, ptr poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %8 = load <4 x i8>, ptr poison, align 1 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load <2 x i8>, ptr poison, align 1 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load <3 x i8>, ptr poison, align 1 +; GFX90A-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %8 = load <4 x i8>, ptr poison, align 1 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = load i8, ptr poison, align 4 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = load <2 x i8>, ptr poison, align 4 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = load <3 x i8>, ptr poison, align 4 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %12 = load <4 x i8>, ptr poison, align 4 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <2 x i8>, ptr poison, align 4 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = load <3 x i8>, ptr poison, align 4 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = load <4 x i8>, ptr poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; load i8, ptr poison @@ -154,35 +154,35 @@ define void @loads_addrspace_1(i32 %arg) { ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load <1 x i16>, ptr addrspace(1) poison, align 2 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load <1 x i32>, ptr addrspace(1) poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = load <2 x i1>, ptr addrspace(1) poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = load <2 x i8>, ptr addrspace(1) poison, align 2 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load <2 x i8>, ptr addrspace(1) poison, align 2 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load <2 x i16>, ptr addrspace(1) poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <2 x i32>, ptr addrspace(1) poison, align 8 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %9 = load <3 x i1>, ptr addrspace(1) poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = load <3 x i8>, ptr addrspace(1) poison, align 4 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <3 x i8>, ptr addrspace(1) poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %11 = load <3 x i16>, ptr addrspace(1) poison, align 8 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = load <3 x i32>, ptr addrspace(1) poison, align 16 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %13 = load <4 x i1>, ptr addrspace(1) poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %14 = load <4 x i8>, ptr addrspace(1) poison, align 4 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = load <4 x i8>, ptr addrspace(1) poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = load <4 x i16>, ptr addrspace(1) poison, align 8 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = load <4 x i32>, ptr addrspace(1) poison, align 16 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %17 = load <8 x i1>, ptr addrspace(1) poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %18 = load <8 x i8>, ptr addrspace(1) poison, align 8 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = load <8 x i8>, ptr addrspace(1) poison, align 8 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = load <8 x i16>, ptr addrspace(1) poison, align 16 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = load <8 x i32>, ptr addrspace(1) poison, align 32 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %21 = load <16 x i1>, ptr addrspace(1) poison, align 2 -; 
GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %22 = load <16 x i8>, ptr addrspace(1) poison, align 16 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = load <16 x i8>, ptr addrspace(1) poison, align 16 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = load <16 x i16>, ptr addrspace(1) poison, align 32 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = load <16 x i32>, ptr addrspace(1) poison, align 64 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %25 = load <32 x i1>, ptr addrspace(1) poison, align 4 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %26 = load <32 x i8>, ptr addrspace(1) poison, align 32 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = load <32 x i8>, ptr addrspace(1) poison, align 32 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = load <32 x i16>, ptr addrspace(1) poison, align 64 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = load <32 x i32>, ptr addrspace(1) poison, align 128 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %29 = load <64 x i1>, ptr addrspace(1) poison, align 8 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %30 = load <64 x i8>, ptr addrspace(1) poison, align 64 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %30 = load <64 x i8>, ptr addrspace(1) poison, align 64 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %31 = load <64 x i16>, ptr addrspace(1) poison, align 128 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %32 = load <64 x i32>, ptr addrspace(1) poison, align 256 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %33 = load <128 x i1>, ptr addrspace(1) poison, align 16 -; GFX90A-NEXT: Cost Model: Found an estimated 
cost of 128 for instruction: %34 = load <128 x i8>, ptr addrspace(1) poison, align 128 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %34 = load <128 x i8>, ptr addrspace(1) poison, align 128 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %35 = load <128 x i16>, ptr addrspace(1) poison, align 256 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %36 = load <128 x i32>, ptr addrspace(1) poison, align 512 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void @@ -241,35 +241,35 @@ define void @loads_addrspace_3(i32 %arg) { ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load <1 x i16>, ptr addrspace(3) poison, align 2 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load <1 x i32>, ptr addrspace(3) poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = load <2 x i1>, ptr addrspace(3) poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = load <2 x i8>, ptr addrspace(3) poison, align 2 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load <2 x i8>, ptr addrspace(3) poison, align 2 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load <2 x i16>, ptr addrspace(3) poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <2 x i32>, ptr addrspace(3) poison, align 8 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = load <3 x i1>, ptr addrspace(3) poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = load <3 x i8>, ptr addrspace(3) poison, align 4 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <3 x i8>, ptr addrspace(3) poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %11 = load <3 x i16>, ptr 
addrspace(3) poison, align 8 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = load <3 x i32>, ptr addrspace(3) poison, align 16 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %13 = load <4 x i1>, ptr addrspace(3) poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %14 = load <4 x i8>, ptr addrspace(3) poison, align 4 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = load <4 x i8>, ptr addrspace(3) poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = load <4 x i16>, ptr addrspace(3) poison, align 8 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = load <4 x i32>, ptr addrspace(3) poison, align 16 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %17 = load <8 x i1>, ptr addrspace(3) poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %18 = load <8 x i8>, ptr addrspace(3) poison, align 8 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = load <8 x i8>, ptr addrspace(3) poison, align 8 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = load <8 x i16>, ptr addrspace(3) poison, align 16 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = load <8 x i32>, ptr addrspace(3) poison, align 32 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %21 = load <16 x i1>, ptr addrspace(3) poison, align 2 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %22 = load <16 x i8>, ptr addrspace(3) poison, align 16 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = load <16 x i8>, ptr addrspace(3) poison, align 16 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = load <16 x i16>, ptr addrspace(3) poison, align 32 ; GFX90A-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %24 = load <16 x i32>, ptr addrspace(3) poison, align 64 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %25 = load <32 x i1>, ptr addrspace(3) poison, align 4 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %26 = load <32 x i8>, ptr addrspace(3) poison, align 32 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %26 = load <32 x i8>, ptr addrspace(3) poison, align 32 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = load <32 x i16>, ptr addrspace(3) poison, align 64 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = load <32 x i32>, ptr addrspace(3) poison, align 128 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %29 = load <64 x i1>, ptr addrspace(3) poison, align 8 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %30 = load <64 x i8>, ptr addrspace(3) poison, align 64 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %30 = load <64 x i8>, ptr addrspace(3) poison, align 64 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %31 = load <64 x i16>, ptr addrspace(3) poison, align 128 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %32 = load <64 x i32>, ptr addrspace(3) poison, align 256 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %33 = load <128 x i1>, ptr addrspace(3) poison, align 16 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %34 = load <128 x i8>, ptr addrspace(3) poison, align 128 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %34 = load <128 x i8>, ptr addrspace(3) poison, align 128 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %35 = load <128 x i16>, ptr addrspace(3) poison, align 256 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
%36 = load <128 x i32>, ptr addrspace(3) poison, align 512 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void @@ -328,35 +328,35 @@ define void @loads_addrspace_5(i32 %arg) { ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load <1 x i16>, ptr addrspace(5) poison, align 2 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load <1 x i32>, ptr addrspace(5) poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = load <2 x i1>, ptr addrspace(5) poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = load <2 x i8>, ptr addrspace(5) poison, align 2 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load <2 x i8>, ptr addrspace(5) poison, align 2 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load <2 x i16>, ptr addrspace(5) poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <2 x i32>, ptr addrspace(5) poison, align 8 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = load <3 x i1>, ptr addrspace(5) poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = load <3 x i8>, ptr addrspace(5) poison, align 4 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <3 x i8>, ptr addrspace(5) poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %11 = load <3 x i16>, ptr addrspace(5) poison, align 8 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = load <3 x i32>, ptr addrspace(5) poison, align 16 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %13 = load <4 x i1>, ptr addrspace(5) poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %14 = load <4 x i8>, ptr addrspace(5) poison, align 4 +; 
GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = load <4 x i8>, ptr addrspace(5) poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = load <4 x i16>, ptr addrspace(5) poison, align 8 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = load <4 x i32>, ptr addrspace(5) poison, align 16 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %17 = load <8 x i1>, ptr addrspace(5) poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %18 = load <8 x i8>, ptr addrspace(5) poison, align 8 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = load <8 x i8>, ptr addrspace(5) poison, align 8 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = load <8 x i16>, ptr addrspace(5) poison, align 16 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = load <8 x i32>, ptr addrspace(5) poison, align 32 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %21 = load <16 x i1>, ptr addrspace(5) poison, align 2 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %22 = load <16 x i8>, ptr addrspace(5) poison, align 16 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %22 = load <16 x i8>, ptr addrspace(5) poison, align 16 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = load <16 x i16>, ptr addrspace(5) poison, align 32 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = load <16 x i32>, ptr addrspace(5) poison, align 64 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %25 = load <32 x i1>, ptr addrspace(5) poison, align 4 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %26 = load <32 x i8>, ptr addrspace(5) poison, align 32 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %26 = load <32 x i8>, ptr addrspace(5) poison, align 32 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = load <32 x i16>, ptr addrspace(5) poison, align 64 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = load <32 x i32>, ptr addrspace(5) poison, align 128 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %29 = load <64 x i1>, ptr addrspace(5) poison, align 8 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %30 = load <64 x i8>, ptr addrspace(5) poison, align 64 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %30 = load <64 x i8>, ptr addrspace(5) poison, align 64 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %31 = load <64 x i16>, ptr addrspace(5) poison, align 128 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %32 = load <64 x i32>, ptr addrspace(5) poison, align 256 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %33 = load <128 x i1>, ptr addrspace(5) poison, align 16 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %34 = load <128 x i8>, ptr addrspace(5) poison, align 128 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %34 = load <128 x i8>, ptr addrspace(5) poison, align 128 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %35 = load <128 x i16>, ptr addrspace(5) poison, align 256 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %36 = load <128 x i32>, ptr addrspace(5) poison, align 512 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll index 4fc8c44e12668..3aa682cd2971c 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll +++ 
b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll @@ -396,157 +396,157 @@ define amdgpu_kernel void @shufflevector_i16(<2 x i16> %vec1, <2 x i16> %vec2) { ; Should not assert define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) { ; ALL-LABEL: 'shufflevector_i8' -; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an 
estimated cost of 3 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf31 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf32 = shufflevector <2 x i8> %vec1, <2 x 
i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> zeroinitializer -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf001 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf010 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf011 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf100 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf101 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf110 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf002 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf020 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf022 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf200 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf202 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf220 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found 
an estimated cost of 6 for instruction: %shuf222 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf112 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf121 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf122 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf211 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf212 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf221 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf32 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf000 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> zeroinitializer +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf001 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf010 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf011 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf100 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %shuf101 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf110 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf111 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf002 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf020 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf022 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf200 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf202 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf220 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf222 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf112 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf121 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf122 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf211 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf212 = shufflevector <2 x i8> %vec1, <2 x i8> 
%vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf221 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf31_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf32_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> zeroinitializer -; ALL-NEXT: Cost Model: Found an estimated cost of 6 
for instruction: %shuf001_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf010_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf011_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf100_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf101_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf110_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf002_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf020_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf022_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf200_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf202_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf220_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf222_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf112_2 = 
shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf121_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf122_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf211_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf212_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf221_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf32_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf000_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> zeroinitializer +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf001_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf010_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf011_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf100_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf101_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf110_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf111_2 = shufflevector <2 x i8> 
%vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf002_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf020_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf022_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf200_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf202_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf220_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf222_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf112_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf121_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf122_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf211_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf212_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf221_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; ALL-SIZE-LABEL: 'shufflevector_i8' -; ALL-SIZE-NEXT: Cost 
Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; 
ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf31 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf32 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> zeroinitializer 
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf001 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf010 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf011 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf100 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf101 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf110 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf002 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf020 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf022 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf200 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf202 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf220 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf222 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x 
i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf112 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf121 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf122 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf211 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf212 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf221 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf32 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf000 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> zeroinitializer +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf001 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf010 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf011 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf100 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf101 = shufflevector <2 
x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf110 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf111 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf002 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf020 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf022 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf200 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf202 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf220 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf222 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf112 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf121 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf122 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf211 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf212 = shufflevector 
<2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf221 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf31_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf32_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated 
cost of 4 for instruction: %shuf000_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> zeroinitializer -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf001_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf010_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf011_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf100_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf101_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf110_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf002_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf020_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf022_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf200_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf202_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf220_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; 
ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf222_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf112_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf121_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf122_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf211_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf212_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf221_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf32_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf000_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> zeroinitializer +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf001_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf010_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf011_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf100_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf101_2 = shufflevector 
<2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf110_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf111_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf002_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf020_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf022_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf200_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf202_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf220_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf222_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf112_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf121_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf122_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf211_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %shuf212_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf221_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer @@ -861,22 +861,22 @@ define amdgpu_kernel void @shufflevector_i32(<2 x i32> %vec1, <2 x i32> %vec2) { ; Other shuffle cases define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> %i8v4_2, <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i8> %i8v16, <16 x i8> %i8v16_2, <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i16> %i16v8, <8 x i16> %i16v8_2, <2 x i32> %i32v2, <2 x i32> %i32v2_2, <4 x i32> %i32v4, <4 x i32> %i32v4_2, <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x float> %floatv4, <4 x float> %floatv4_2,<2 x i64> %i64v2, <2 x i64> %i64v2_2,<2 x double> %doublev2, <2 x double> %doublev2_2) { ; GFX9-10-LABEL: 'shuffle' -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> -; GFX9-10-NEXT: Cost Model: 
Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> +; 
GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x 
i16> %i16v2_2, <2 x i32> ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> @@ -898,22 +898,22 @@ define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; VI-LABEL: 'shuffle' -; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8_2 = shufflevector 
<6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> +; 
VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> @@ -935,22 +935,22 @@ define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> ; VI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX9-10-SIZE-LABEL: 'shuffle' -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: 
%v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> 
%i8v2, <2 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated 
cost of 8 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> @@ -972,22 +972,22 @@ define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; VI-SIZE-LABEL: 'shuffle' -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for 
instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> +; VI-SIZE-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> @@ 
-1047,9 +1047,9 @@ define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> define void @concat(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i8> %i8v8, <8 x i8> %i8v8_2, <2 x half> %halfv2, <2 x half> %halfv2_2, <4 x half> %halfv4, <4 x half> %halfv4_2, <8 x half> %halfv8, <8 x half> %halfv8_2, <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i16> %i16v8, <8 x i16> %i16v8_2, <2 x i32> %i32v2, <2 x i32> %i32v2_2, <4 x i32> %i32v4, <4 x i32> %i32v4_2, <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x float> %floatv4, <4 x float> %floatv4_2,<2 x i64> %i64v2, <2 x i64> %i64v2_2,<2 x double> %doublev2, <2 x double> %doublev2_2) { ; ALL-LABEL: 'concat' -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <4 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <8 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i16 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <16 x i32> @@ -1062,9 +1062,9 @@ 
define void @concat(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> % ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <4 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <8 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <4 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i16_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <16 x i32> @@ -1080,9 +1080,9 @@ define void @concat(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> % ; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; ALL-SIZE-LABEL: 
'concat' -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <4 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <8 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i16 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <16 x i32> @@ -1095,9 +1095,9 @@ define void @concat(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> % ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <4 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <8 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <4 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> -; 
ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i16_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <16 x i32> diff --git a/llvm/test/Analysis/CostModel/AMDGPU/store.ll b/llvm/test/Analysis/CostModel/AMDGPU/store.ll index 9672c3256c751..6dc4befdfbd9e 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/store.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/store.ll @@ -20,17 +20,17 @@ define void @stores_i1(i32 %arg) { define void @stores_i8(i32 %arg) { ; GFX90A-LABEL: 'stores_i8' ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 poison, ptr poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i8> poison, ptr poison, align 2 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <3 x i8> poison, ptr poison, align 4 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i8> poison, ptr poison, align 4 +; 
GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> poison, ptr poison, align 2 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i8> poison, ptr poison, align 4 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> poison, ptr poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 poison, ptr poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i8> poison, ptr poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <3 x i8> poison, ptr poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i8> poison, ptr poison, align 1 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> poison, ptr poison, align 1 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i8> poison, ptr poison, align 1 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> poison, ptr poison, align 1 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 poison, ptr poison, align 4 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i8> poison, ptr poison, align 4 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <3 x i8> poison, ptr poison, align 4 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i8> poison, ptr poison, align 4 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> poison, ptr poison, align 4 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i8> poison, ptr poison, align 4 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> poison, ptr poison, align 4 ; GFX90A-NEXT: Cost Model: 
Found an estimated cost of 10 for instruction: ret void ; store i8 poison, ptr poison @@ -153,35 +153,35 @@ define void @stores_addrspace_1(i32 %arg) { ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <1 x i16> poison, ptr addrspace(1) poison, align 2 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <1 x i32> poison, ptr addrspace(1) poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i1> poison, ptr addrspace(1) poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i8> poison, ptr addrspace(1) poison, align 2 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> poison, ptr addrspace(1) poison, align 2 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i16> poison, ptr addrspace(1) poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> poison, ptr addrspace(1) poison, align 8 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <3 x i1> poison, ptr addrspace(1) poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <3 x i8> poison, ptr addrspace(1) poison, align 4 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i8> poison, ptr addrspace(1) poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i16> poison, ptr addrspace(1) poison, align 8 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i32> poison, ptr addrspace(1) poison, align 16 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i1> poison, ptr addrspace(1) poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i8> poison, ptr addrspace(1) poison, align 4 +; GFX90A-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: store <4 x i8> poison, ptr addrspace(1) poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i16> poison, ptr addrspace(1) poison, align 8 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> poison, ptr addrspace(1) poison, align 16 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <8 x i1> poison, ptr addrspace(1) poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <8 x i8> poison, ptr addrspace(1) poison, align 8 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i8> poison, ptr addrspace(1) poison, align 8 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> poison, ptr addrspace(1) poison, align 16 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i32> poison, ptr addrspace(1) poison, align 32 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <16 x i1> poison, ptr addrspace(1) poison, align 2 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <16 x i8> poison, ptr addrspace(1) poison, align 16 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> poison, ptr addrspace(1) poison, align 16 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i16> poison, ptr addrspace(1) poison, align 32 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i32> poison, ptr addrspace(1) poison, align 64 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: store <32 x i1> poison, ptr addrspace(1) poison, align 4 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: store <32 x i8> poison, ptr addrspace(1) poison, align 32 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: store <32 x i8> poison, ptr addrspace(1) poison, align 32 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <32 x i16> poison, ptr addrspace(1) poison, align 64 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <32 x i32> poison, ptr addrspace(1) poison, align 128 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 64 for instruction: store <64 x i1> poison, ptr addrspace(1) poison, align 8 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 64 for instruction: store <64 x i8> poison, ptr addrspace(1) poison, align 64 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <64 x i8> poison, ptr addrspace(1) poison, align 64 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <64 x i16> poison, ptr addrspace(1) poison, align 128 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <64 x i32> poison, ptr addrspace(1) poison, align 256 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 128 for instruction: store <128 x i1> poison, ptr addrspace(1) poison, align 16 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 128 for instruction: store <128 x i8> poison, ptr addrspace(1) poison, align 128 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <128 x i8> poison, ptr addrspace(1) poison, align 128 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <128 x i16> poison, ptr addrspace(1) poison, align 256 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <128 x i32> poison, ptr addrspace(1) poison, align 512 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void @@ -241,35 +241,35 @@ define void @stores_addrspace_3(i32 %arg) { ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <1 x i16> poison, ptr addrspace(3) poison, align 2 ; GFX90A-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: store <1 x i32> poison, ptr addrspace(3) poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i1> poison, ptr addrspace(3) poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i8> poison, ptr addrspace(3) poison, align 2 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> poison, ptr addrspace(3) poison, align 2 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i16> poison, ptr addrspace(3) poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> poison, ptr addrspace(3) poison, align 8 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <3 x i1> poison, ptr addrspace(3) poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <3 x i8> poison, ptr addrspace(3) poison, align 4 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i8> poison, ptr addrspace(3) poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i16> poison, ptr addrspace(3) poison, align 8 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i32> poison, ptr addrspace(3) poison, align 16 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i1> poison, ptr addrspace(3) poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i8> poison, ptr addrspace(3) poison, align 4 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> poison, ptr addrspace(3) poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i16> poison, ptr addrspace(3) poison, align 8 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> poison, ptr 
addrspace(3) poison, align 16 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <8 x i1> poison, ptr addrspace(3) poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <8 x i8> poison, ptr addrspace(3) poison, align 8 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i8> poison, ptr addrspace(3) poison, align 8 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> poison, ptr addrspace(3) poison, align 16 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i32> poison, ptr addrspace(3) poison, align 32 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <16 x i1> poison, ptr addrspace(3) poison, align 2 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <16 x i8> poison, ptr addrspace(3) poison, align 16 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> poison, ptr addrspace(3) poison, align 16 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i16> poison, ptr addrspace(3) poison, align 32 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i32> poison, ptr addrspace(3) poison, align 64 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: store <32 x i1> poison, ptr addrspace(3) poison, align 4 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: store <32 x i8> poison, ptr addrspace(3) poison, align 32 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <32 x i8> poison, ptr addrspace(3) poison, align 32 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <32 x i16> poison, ptr addrspace(3) poison, align 64 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <32 x i32> poison, ptr addrspace(3) poison, align 
128 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 64 for instruction: store <64 x i1> poison, ptr addrspace(3) poison, align 8 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 64 for instruction: store <64 x i8> poison, ptr addrspace(3) poison, align 64 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <64 x i8> poison, ptr addrspace(3) poison, align 64 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <64 x i16> poison, ptr addrspace(3) poison, align 128 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <64 x i32> poison, ptr addrspace(3) poison, align 256 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 128 for instruction: store <128 x i1> poison, ptr addrspace(3) poison, align 16 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 128 for instruction: store <128 x i8> poison, ptr addrspace(3) poison, align 128 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <128 x i8> poison, ptr addrspace(3) poison, align 128 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <128 x i16> poison, ptr addrspace(3) poison, align 256 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <128 x i32> poison, ptr addrspace(3) poison, align 512 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void @@ -329,35 +329,35 @@ define void @stores_addrspace_5(i32 %arg) { ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <1 x i16> poison, ptr addrspace(5) poison, align 2 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <1 x i32> poison, ptr addrspace(5) poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i1> poison, ptr addrspace(5) poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i8> poison, ptr addrspace(5) poison, 
align 2 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> poison, ptr addrspace(5) poison, align 2 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i16> poison, ptr addrspace(5) poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> poison, ptr addrspace(5) poison, align 8 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <3 x i1> poison, ptr addrspace(5) poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <3 x i8> poison, ptr addrspace(5) poison, align 4 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i8> poison, ptr addrspace(5) poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i16> poison, ptr addrspace(5) poison, align 8 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i32> poison, ptr addrspace(5) poison, align 16 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i1> poison, ptr addrspace(5) poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i8> poison, ptr addrspace(5) poison, align 4 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> poison, ptr addrspace(5) poison, align 4 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i16> poison, ptr addrspace(5) poison, align 8 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> poison, ptr addrspace(5) poison, align 16 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <8 x i1> poison, ptr addrspace(5) poison, align 1 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <8 x i8> poison, ptr addrspace(5) poison, align 8 +; GFX90A-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: store <8 x i8> poison, ptr addrspace(5) poison, align 8 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> poison, ptr addrspace(5) poison, align 16 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i32> poison, ptr addrspace(5) poison, align 32 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <16 x i1> poison, ptr addrspace(5) poison, align 2 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <16 x i8> poison, ptr addrspace(5) poison, align 16 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <16 x i8> poison, ptr addrspace(5) poison, align 16 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i16> poison, ptr addrspace(5) poison, align 32 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i32> poison, ptr addrspace(5) poison, align 64 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: store <32 x i1> poison, ptr addrspace(5) poison, align 4 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: store <32 x i8> poison, ptr addrspace(5) poison, align 32 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <32 x i8> poison, ptr addrspace(5) poison, align 32 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <32 x i16> poison, ptr addrspace(5) poison, align 64 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <32 x i32> poison, ptr addrspace(5) poison, align 128 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 64 for instruction: store <64 x i1> poison, ptr addrspace(5) poison, align 8 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 64 for instruction: store <64 x i8> poison, ptr addrspace(5) poison, align 64 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 
for instruction: store <64 x i8> poison, ptr addrspace(5) poison, align 64 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <64 x i16> poison, ptr addrspace(5) poison, align 128 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <64 x i32> poison, ptr addrspace(5) poison, align 256 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 128 for instruction: store <128 x i1> poison, ptr addrspace(5) poison, align 16 -; GFX90A-NEXT: Cost Model: Found an estimated cost of 128 for instruction: store <128 x i8> poison, ptr addrspace(5) poison, align 128 +; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: store <128 x i8> poison, ptr addrspace(5) poison, align 128 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <128 x i16> poison, ptr addrspace(5) poison, align 256 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <128 x i32> poison, ptr addrspace(5) poison, align 512 ; GFX90A-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/vectorize-i8.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/vectorize-i8.ll index b9b1bc1be681e..605eccf26cd3c 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/vectorize-i8.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/vectorize-i8.ll @@ -21,28 +21,20 @@ define protected amdgpu_kernel void @arith_2(<16 x i8> %invec, ptr %out, i32 %fl ; GFX8-LABEL: define protected amdgpu_kernel void @arith_2( ; GFX8-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0:[0-9]+]] { ; GFX8-NEXT: [[ENTRY:.*:]] -; GFX8-NEXT: [[EL0:%.*]] = extractelement <16 x i8> [[INVEC]], i64 0 -; GFX8-NEXT: [[EL1:%.*]] = extractelement <16 x i8> [[INVEC]], i64 1 -; GFX8-NEXT: [[MUL0:%.*]] = mul i8 [[EL0]], 1 -; GFX8-NEXT: [[MUL1:%.*]] = mul i8 [[EL1]], 1 -; GFX8-NEXT: [[ADD0:%.*]] = add i8 [[MUL0]], 1 -; GFX8-NEXT: [[ADD1:%.*]] = add i8 
[[MUL1]], 1 -; GFX8-NEXT: [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD0]], i64 0 -; GFX8-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VECINS0]], i8 [[ADD1]], i64 1 +; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <2 x i32> +; GFX8-NEXT: [[TMP1:%.*]] = mul <2 x i8> [[TMP0]], splat (i8 1) +; GFX8-NEXT: [[TMP2:%.*]] = add <2 x i8> [[TMP1]], splat (i8 1) +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> ; GFX8-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT]], align 16 ; GFX8-NEXT: ret void ; ; GFX9-LABEL: define protected amdgpu_kernel void @arith_2( ; GFX9-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0:[0-9]+]] { ; GFX9-NEXT: [[ENTRY:.*:]] -; GFX9-NEXT: [[EL0:%.*]] = extractelement <16 x i8> [[INVEC]], i64 0 -; GFX9-NEXT: [[EL1:%.*]] = extractelement <16 x i8> [[INVEC]], i64 1 -; GFX9-NEXT: [[MUL0:%.*]] = mul i8 [[EL0]], 1 -; GFX9-NEXT: [[MUL1:%.*]] = mul i8 [[EL1]], 1 -; GFX9-NEXT: [[ADD0:%.*]] = add i8 [[MUL0]], 1 -; GFX9-NEXT: [[ADD1:%.*]] = add i8 [[MUL1]], 1 -; GFX9-NEXT: [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD0]], i64 0 -; GFX9-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VECINS0]], i8 [[ADD1]], i64 1 +; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <2 x i32> +; GFX9-NEXT: [[TMP1:%.*]] = mul <2 x i8> [[TMP0]], splat (i8 1) +; GFX9-NEXT: [[TMP2:%.*]] = add <2 x i8> [[TMP1]], splat (i8 1) +; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> ; GFX9-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT]], align 16 ; GFX9-NEXT: ret void ; @@ -82,17 +74,14 @@ define protected amdgpu_kernel void @arith_3(<16 x i8> %invec, ptr %out, i32 %fl ; GFX8-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] { ; GFX8-NEXT: [[ENTRY:.*:]] ; GFX8-NEXT: [[EL0:%.*]] = extractelement <16 x i8> [[INVEC]], i64 0 -; GFX8-NEXT: [[EL1:%.*]] = extractelement <16 x i8> [[INVEC]], 
i64 1 -; GFX8-NEXT: [[EL2:%.*]] = extractelement <16 x i8> [[INVEC]], i64 2 -; GFX8-NEXT: [[MUL2:%.*]] = mul i8 [[EL0]], 1 -; GFX8-NEXT: [[MUL1:%.*]] = mul i8 [[EL1]], 1 -; GFX8-NEXT: [[MUL3:%.*]] = mul i8 [[EL2]], 1 -; GFX8-NEXT: [[ADD2:%.*]] = add i8 [[MUL2]], 1 -; GFX8-NEXT: [[ADD1:%.*]] = add i8 [[MUL1]], 1 +; GFX8-NEXT: [[MUL3:%.*]] = mul i8 [[EL0]], 1 ; GFX8-NEXT: [[ADD3:%.*]] = add i8 [[MUL3]], 1 -; GFX8-NEXT: [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD2]], i64 0 -; GFX8-NEXT: [[VECINS1:%.*]] = insertelement <16 x i8> [[VECINS0]], i8 [[ADD1]], i64 1 -; GFX8-NEXT: [[VECINS2:%.*]] = insertelement <16 x i8> [[VECINS1]], i8 [[ADD3]], i64 2 +; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <2 x i32> +; GFX8-NEXT: [[TMP1:%.*]] = mul <2 x i8> [[TMP0]], splat (i8 1) +; GFX8-NEXT: [[TMP2:%.*]] = add <2 x i8> [[TMP1]], splat (i8 1) +; GFX8-NEXT: [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD3]], i64 0 +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> +; GFX8-NEXT: [[VECINS2:%.*]] = shufflevector <16 x i8> [[VECINS0]], <16 x i8> [[TMP3]], <16 x i32> ; GFX8-NEXT: store <16 x i8> [[VECINS2]], ptr [[OUT]], align 16 ; GFX8-NEXT: ret void ; @@ -100,17 +89,14 @@ define protected amdgpu_kernel void @arith_3(<16 x i8> %invec, ptr %out, i32 %fl ; GFX9-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] { ; GFX9-NEXT: [[ENTRY:.*:]] ; GFX9-NEXT: [[EL0:%.*]] = extractelement <16 x i8> [[INVEC]], i64 0 -; GFX9-NEXT: [[EL1:%.*]] = extractelement <16 x i8> [[INVEC]], i64 1 -; GFX9-NEXT: [[EL2:%.*]] = extractelement <16 x i8> [[INVEC]], i64 2 -; GFX9-NEXT: [[MUL2:%.*]] = mul i8 [[EL0]], 1 -; GFX9-NEXT: [[MUL1:%.*]] = mul i8 [[EL1]], 1 -; GFX9-NEXT: [[MUL3:%.*]] = mul i8 [[EL2]], 1 -; GFX9-NEXT: [[ADD2:%.*]] = add i8 [[MUL2]], 1 -; GFX9-NEXT: [[ADD1:%.*]] = add i8 [[MUL1]], 1 +; GFX9-NEXT: [[MUL3:%.*]] = mul i8 [[EL0]], 1 ; GFX9-NEXT: [[ADD3:%.*]] = add i8 
[[MUL3]], 1 -; GFX9-NEXT: [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD2]], i64 0 -; GFX9-NEXT: [[VECINS1:%.*]] = insertelement <16 x i8> [[VECINS0]], i8 [[ADD1]], i64 1 -; GFX9-NEXT: [[VECINS2:%.*]] = insertelement <16 x i8> [[VECINS1]], i8 [[ADD3]], i64 2 +; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <2 x i32> +; GFX9-NEXT: [[TMP1:%.*]] = mul <2 x i8> [[TMP0]], splat (i8 1) +; GFX9-NEXT: [[TMP2:%.*]] = add <2 x i8> [[TMP1]], splat (i8 1) +; GFX9-NEXT: [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD3]], i64 0 +; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> +; GFX9-NEXT: [[VECINS2:%.*]] = shufflevector <16 x i8> [[VECINS0]], <16 x i8> [[TMP3]], <16 x i32> ; GFX9-NEXT: store <16 x i8> [[VECINS2]], ptr [[OUT]], align 16 ; GFX9-NEXT: ret void ; @@ -157,44 +143,20 @@ define protected amdgpu_kernel void @arith_4(<16 x i8> %invec, ptr %out, i32 %fl ; GFX8-LABEL: define protected amdgpu_kernel void @arith_4( ; GFX8-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] { ; GFX8-NEXT: [[ENTRY:.*:]] -; GFX8-NEXT: [[EL0:%.*]] = extractelement <16 x i8> [[INVEC]], i64 0 -; GFX8-NEXT: [[EL1:%.*]] = extractelement <16 x i8> [[INVEC]], i64 1 -; GFX8-NEXT: [[EL2:%.*]] = extractelement <16 x i8> [[INVEC]], i64 2 -; GFX8-NEXT: [[EL3:%.*]] = extractelement <16 x i8> [[INVEC]], i64 3 -; GFX8-NEXT: [[MUL0:%.*]] = mul i8 [[EL0]], 1 -; GFX8-NEXT: [[MUL1:%.*]] = mul i8 [[EL1]], 1 -; GFX8-NEXT: [[MUL2:%.*]] = mul i8 [[EL2]], 1 -; GFX8-NEXT: [[MUL3:%.*]] = mul i8 [[EL3]], 1 -; GFX8-NEXT: [[ADD0:%.*]] = add i8 [[MUL0]], 1 -; GFX8-NEXT: [[ADD1:%.*]] = add i8 [[MUL1]], 1 -; GFX8-NEXT: [[ADD2:%.*]] = add i8 [[MUL2]], 1 -; GFX8-NEXT: [[ADD3:%.*]] = add i8 [[MUL3]], 1 -; GFX8-NEXT: [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD0]], i64 0 -; GFX8-NEXT: [[VECINS1:%.*]] = insertelement <16 x i8> [[VECINS0]], i8 [[ADD1]], i64 1 -; GFX8-NEXT: [[VECINS2:%.*]] = 
insertelement <16 x i8> [[VECINS1]], i8 [[ADD2]], i64 2 -; GFX8-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VECINS2]], i8 [[ADD3]], i64 3 +; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> +; GFX8-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[TMP0]], splat (i8 1) +; GFX8-NEXT: [[TMP2:%.*]] = add <4 x i8> [[TMP1]], splat (i8 1) +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> ; GFX8-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT]], align 16 ; GFX8-NEXT: ret void ; ; GFX9-LABEL: define protected amdgpu_kernel void @arith_4( ; GFX9-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] { ; GFX9-NEXT: [[ENTRY:.*:]] -; GFX9-NEXT: [[EL0:%.*]] = extractelement <16 x i8> [[INVEC]], i64 0 -; GFX9-NEXT: [[EL1:%.*]] = extractelement <16 x i8> [[INVEC]], i64 1 -; GFX9-NEXT: [[EL2:%.*]] = extractelement <16 x i8> [[INVEC]], i64 2 -; GFX9-NEXT: [[EL3:%.*]] = extractelement <16 x i8> [[INVEC]], i64 3 -; GFX9-NEXT: [[MUL0:%.*]] = mul i8 [[EL0]], 1 -; GFX9-NEXT: [[MUL1:%.*]] = mul i8 [[EL1]], 1 -; GFX9-NEXT: [[MUL2:%.*]] = mul i8 [[EL2]], 1 -; GFX9-NEXT: [[MUL3:%.*]] = mul i8 [[EL3]], 1 -; GFX9-NEXT: [[ADD0:%.*]] = add i8 [[MUL0]], 1 -; GFX9-NEXT: [[ADD1:%.*]] = add i8 [[MUL1]], 1 -; GFX9-NEXT: [[ADD2:%.*]] = add i8 [[MUL2]], 1 -; GFX9-NEXT: [[ADD3:%.*]] = add i8 [[MUL3]], 1 -; GFX9-NEXT: [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD0]], i64 0 -; GFX9-NEXT: [[VECINS1:%.*]] = insertelement <16 x i8> [[VECINS0]], i8 [[ADD1]], i64 1 -; GFX9-NEXT: [[VECINS2:%.*]] = insertelement <16 x i8> [[VECINS1]], i8 [[ADD2]], i64 2 -; GFX9-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VECINS2]], i8 [[ADD3]], i64 3 +; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> +; GFX9-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[TMP0]], splat (i8 1) +; GFX9-NEXT: [[TMP2:%.*]] = add <4 x i8> [[TMP1]], splat (i8 1) +; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP2]], 
<4 x i8> poison, <16 x i32> ; GFX9-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT]], align 16 ; GFX9-NEXT: ret void ; @@ -293,140 +255,50 @@ define protected amdgpu_kernel void @arith_16(<16 x i8> %invec, ptr %out, i32 %f ; GFX8-LABEL: define protected amdgpu_kernel void @arith_16( ; GFX8-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] { ; GFX8-NEXT: [[ENTRY:.*:]] -; GFX8-NEXT: [[EL0:%.*]] = extractelement <16 x i8> [[INVEC]], i64 0 -; GFX8-NEXT: [[EL1:%.*]] = extractelement <16 x i8> [[INVEC]], i64 1 -; GFX8-NEXT: [[EL2:%.*]] = extractelement <16 x i8> [[INVEC]], i64 2 -; GFX8-NEXT: [[EL3:%.*]] = extractelement <16 x i8> [[INVEC]], i64 3 -; GFX8-NEXT: [[EL4:%.*]] = extractelement <16 x i8> [[INVEC]], i64 4 -; GFX8-NEXT: [[EL5:%.*]] = extractelement <16 x i8> [[INVEC]], i64 5 -; GFX8-NEXT: [[EL6:%.*]] = extractelement <16 x i8> [[INVEC]], i64 6 -; GFX8-NEXT: [[EL7:%.*]] = extractelement <16 x i8> [[INVEC]], i64 7 -; GFX8-NEXT: [[EL8:%.*]] = extractelement <16 x i8> [[INVEC]], i64 8 -; GFX8-NEXT: [[EL9:%.*]] = extractelement <16 x i8> [[INVEC]], i64 9 -; GFX8-NEXT: [[EL10:%.*]] = extractelement <16 x i8> [[INVEC]], i64 10 -; GFX8-NEXT: [[EL11:%.*]] = extractelement <16 x i8> [[INVEC]], i64 11 -; GFX8-NEXT: [[EL12:%.*]] = extractelement <16 x i8> [[INVEC]], i64 12 -; GFX8-NEXT: [[EL13:%.*]] = extractelement <16 x i8> [[INVEC]], i64 13 -; GFX8-NEXT: [[EL14:%.*]] = extractelement <16 x i8> [[INVEC]], i64 14 -; GFX8-NEXT: [[EL15:%.*]] = extractelement <16 x i8> [[INVEC]], i64 15 -; GFX8-NEXT: [[MUL0:%.*]] = mul i8 [[EL0]], 1 -; GFX8-NEXT: [[MUL1:%.*]] = mul i8 [[EL1]], 1 -; GFX8-NEXT: [[MUL2:%.*]] = mul i8 [[EL2]], 1 -; GFX8-NEXT: [[MUL3:%.*]] = mul i8 [[EL3]], 1 -; GFX8-NEXT: [[MUL4:%.*]] = mul i8 [[EL4]], 1 -; GFX8-NEXT: [[MUL5:%.*]] = mul i8 [[EL5]], 1 -; GFX8-NEXT: [[MUL6:%.*]] = mul i8 [[EL6]], 1 -; GFX8-NEXT: [[MUL7:%.*]] = mul i8 [[EL7]], 1 -; GFX8-NEXT: [[MUL8:%.*]] = mul i8 [[EL8]], 1 -; GFX8-NEXT: [[MUL9:%.*]] = mul i8 [[EL9]], 1 -; 
GFX8-NEXT: [[MUL10:%.*]] = mul i8 [[EL10]], 1 -; GFX8-NEXT: [[MUL11:%.*]] = mul i8 [[EL11]], 1 -; GFX8-NEXT: [[MUL12:%.*]] = mul i8 [[EL12]], 1 -; GFX8-NEXT: [[MUL13:%.*]] = mul i8 [[EL13]], 1 -; GFX8-NEXT: [[MUL14:%.*]] = mul i8 [[EL14]], 1 -; GFX8-NEXT: [[MUL15:%.*]] = mul i8 [[EL15]], 1 -; GFX8-NEXT: [[ADD0:%.*]] = add i8 [[MUL0]], 1 -; GFX8-NEXT: [[ADD1:%.*]] = add i8 [[MUL1]], 1 -; GFX8-NEXT: [[ADD2:%.*]] = add i8 [[MUL2]], 1 -; GFX8-NEXT: [[ADD3:%.*]] = add i8 [[MUL3]], 1 -; GFX8-NEXT: [[ADD4:%.*]] = add i8 [[MUL4]], 1 -; GFX8-NEXT: [[ADD5:%.*]] = add i8 [[MUL5]], 1 -; GFX8-NEXT: [[ADD6:%.*]] = add i8 [[MUL6]], 1 -; GFX8-NEXT: [[ADD7:%.*]] = add i8 [[MUL7]], 1 -; GFX8-NEXT: [[ADD8:%.*]] = add i8 [[MUL8]], 1 -; GFX8-NEXT: [[ADD9:%.*]] = add i8 [[MUL9]], 1 -; GFX8-NEXT: [[ADD10:%.*]] = add i8 [[MUL10]], 1 -; GFX8-NEXT: [[ADD11:%.*]] = add i8 [[MUL11]], 1 -; GFX8-NEXT: [[ADD12:%.*]] = add i8 [[MUL12]], 1 -; GFX8-NEXT: [[ADD13:%.*]] = add i8 [[MUL13]], 1 -; GFX8-NEXT: [[ADD14:%.*]] = add i8 [[MUL14]], 1 -; GFX8-NEXT: [[ADD15:%.*]] = add i8 [[MUL15]], 1 -; GFX8-NEXT: [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD0]], i64 0 -; GFX8-NEXT: [[VECINS1:%.*]] = insertelement <16 x i8> [[VECINS0]], i8 [[ADD1]], i64 1 -; GFX8-NEXT: [[VECINS2:%.*]] = insertelement <16 x i8> [[VECINS1]], i8 [[ADD2]], i64 2 -; GFX8-NEXT: [[VECINS3:%.*]] = insertelement <16 x i8> [[VECINS2]], i8 [[ADD3]], i64 3 -; GFX8-NEXT: [[VECINS4:%.*]] = insertelement <16 x i8> [[VECINS3]], i8 [[ADD4]], i64 4 -; GFX8-NEXT: [[VECINS5:%.*]] = insertelement <16 x i8> [[VECINS4]], i8 [[ADD5]], i64 5 -; GFX8-NEXT: [[VECINS6:%.*]] = insertelement <16 x i8> [[VECINS5]], i8 [[ADD6]], i64 6 -; GFX8-NEXT: [[VECINS7:%.*]] = insertelement <16 x i8> [[VECINS6]], i8 [[ADD7]], i64 7 -; GFX8-NEXT: [[VECINS8:%.*]] = insertelement <16 x i8> [[VECINS7]], i8 [[ADD8]], i64 8 -; GFX8-NEXT: [[VECINS9:%.*]] = insertelement <16 x i8> [[VECINS8]], i8 [[ADD9]], i64 9 -; GFX8-NEXT: [[VECINS10:%.*]] = insertelement <16 x 
i8> [[VECINS9]], i8 [[ADD10]], i64 10 -; GFX8-NEXT: [[VECINS11:%.*]] = insertelement <16 x i8> [[VECINS10]], i8 [[ADD11]], i64 11 -; GFX8-NEXT: [[VECINS12:%.*]] = insertelement <16 x i8> [[VECINS11]], i8 [[ADD12]], i64 12 -; GFX8-NEXT: [[VECINS13:%.*]] = insertelement <16 x i8> [[VECINS12]], i8 [[ADD13]], i64 13 -; GFX8-NEXT: [[VECINS14:%.*]] = insertelement <16 x i8> [[VECINS13]], i8 [[ADD14]], i64 14 -; GFX8-NEXT: [[VECINS153:%.*]] = insertelement <16 x i8> [[VECINS14]], i8 [[ADD15]], i64 15 +; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> +; GFX8-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[TMP0]], splat (i8 1) +; GFX8-NEXT: [[TMP2:%.*]] = add <4 x i8> [[TMP1]], splat (i8 1) +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> +; GFX8-NEXT: [[TMP4:%.*]] = mul <4 x i8> [[TMP3]], splat (i8 1) +; GFX8-NEXT: [[TMP5:%.*]] = add <4 x i8> [[TMP4]], splat (i8 1) +; GFX8-NEXT: [[TMP6:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> +; GFX8-NEXT: [[TMP7:%.*]] = mul <4 x i8> [[TMP6]], splat (i8 1) +; GFX8-NEXT: [[TMP8:%.*]] = add <4 x i8> [[TMP7]], splat (i8 1) +; GFX8-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> +; GFX8-NEXT: [[TMP10:%.*]] = mul <4 x i8> [[TMP9]], splat (i8 1) +; GFX8-NEXT: [[TMP11:%.*]] = add <4 x i8> [[TMP10]], splat (i8 1) +; GFX8-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; GFX8-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> +; GFX8-NEXT: [[VECINS71:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> +; GFX8-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> +; GFX8-NEXT: [[VECINS112:%.*]] = shufflevector <16 x i8> [[VECINS71]], <16 x i8> [[TMP14]], <16 x i32> +; GFX8-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> +; GFX8-NEXT: [[VECINS153:%.*]] = 
shufflevector <16 x i8> [[VECINS112]], <16 x i8> [[TMP15]], <16 x i32> ; GFX8-NEXT: store <16 x i8> [[VECINS153]], ptr [[OUT]], align 16 ; GFX8-NEXT: ret void ; ; GFX9-LABEL: define protected amdgpu_kernel void @arith_16( ; GFX9-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] { ; GFX9-NEXT: [[ENTRY:.*:]] -; GFX9-NEXT: [[EL0:%.*]] = extractelement <16 x i8> [[INVEC]], i64 0 -; GFX9-NEXT: [[EL1:%.*]] = extractelement <16 x i8> [[INVEC]], i64 1 -; GFX9-NEXT: [[EL2:%.*]] = extractelement <16 x i8> [[INVEC]], i64 2 -; GFX9-NEXT: [[EL3:%.*]] = extractelement <16 x i8> [[INVEC]], i64 3 -; GFX9-NEXT: [[EL4:%.*]] = extractelement <16 x i8> [[INVEC]], i64 4 -; GFX9-NEXT: [[EL5:%.*]] = extractelement <16 x i8> [[INVEC]], i64 5 -; GFX9-NEXT: [[EL6:%.*]] = extractelement <16 x i8> [[INVEC]], i64 6 -; GFX9-NEXT: [[EL7:%.*]] = extractelement <16 x i8> [[INVEC]], i64 7 -; GFX9-NEXT: [[EL8:%.*]] = extractelement <16 x i8> [[INVEC]], i64 8 -; GFX9-NEXT: [[EL9:%.*]] = extractelement <16 x i8> [[INVEC]], i64 9 -; GFX9-NEXT: [[EL10:%.*]] = extractelement <16 x i8> [[INVEC]], i64 10 -; GFX9-NEXT: [[EL11:%.*]] = extractelement <16 x i8> [[INVEC]], i64 11 -; GFX9-NEXT: [[EL12:%.*]] = extractelement <16 x i8> [[INVEC]], i64 12 -; GFX9-NEXT: [[EL13:%.*]] = extractelement <16 x i8> [[INVEC]], i64 13 -; GFX9-NEXT: [[EL14:%.*]] = extractelement <16 x i8> [[INVEC]], i64 14 -; GFX9-NEXT: [[EL15:%.*]] = extractelement <16 x i8> [[INVEC]], i64 15 -; GFX9-NEXT: [[MUL0:%.*]] = mul i8 [[EL0]], 1 -; GFX9-NEXT: [[MUL1:%.*]] = mul i8 [[EL1]], 1 -; GFX9-NEXT: [[MUL2:%.*]] = mul i8 [[EL2]], 1 -; GFX9-NEXT: [[MUL3:%.*]] = mul i8 [[EL3]], 1 -; GFX9-NEXT: [[MUL4:%.*]] = mul i8 [[EL4]], 1 -; GFX9-NEXT: [[MUL5:%.*]] = mul i8 [[EL5]], 1 -; GFX9-NEXT: [[MUL6:%.*]] = mul i8 [[EL6]], 1 -; GFX9-NEXT: [[MUL7:%.*]] = mul i8 [[EL7]], 1 -; GFX9-NEXT: [[MUL8:%.*]] = mul i8 [[EL8]], 1 -; GFX9-NEXT: [[MUL9:%.*]] = mul i8 [[EL9]], 1 -; GFX9-NEXT: [[MUL10:%.*]] = mul i8 [[EL10]], 1 -; GFX9-NEXT: 
[[MUL11:%.*]] = mul i8 [[EL11]], 1 -; GFX9-NEXT: [[MUL12:%.*]] = mul i8 [[EL12]], 1 -; GFX9-NEXT: [[MUL13:%.*]] = mul i8 [[EL13]], 1 -; GFX9-NEXT: [[MUL14:%.*]] = mul i8 [[EL14]], 1 -; GFX9-NEXT: [[MUL15:%.*]] = mul i8 [[EL15]], 1 -; GFX9-NEXT: [[ADD0:%.*]] = add i8 [[MUL0]], 1 -; GFX9-NEXT: [[ADD1:%.*]] = add i8 [[MUL1]], 1 -; GFX9-NEXT: [[ADD2:%.*]] = add i8 [[MUL2]], 1 -; GFX9-NEXT: [[ADD3:%.*]] = add i8 [[MUL3]], 1 -; GFX9-NEXT: [[ADD4:%.*]] = add i8 [[MUL4]], 1 -; GFX9-NEXT: [[ADD5:%.*]] = add i8 [[MUL5]], 1 -; GFX9-NEXT: [[ADD6:%.*]] = add i8 [[MUL6]], 1 -; GFX9-NEXT: [[ADD7:%.*]] = add i8 [[MUL7]], 1 -; GFX9-NEXT: [[ADD8:%.*]] = add i8 [[MUL8]], 1 -; GFX9-NEXT: [[ADD9:%.*]] = add i8 [[MUL9]], 1 -; GFX9-NEXT: [[ADD10:%.*]] = add i8 [[MUL10]], 1 -; GFX9-NEXT: [[ADD11:%.*]] = add i8 [[MUL11]], 1 -; GFX9-NEXT: [[ADD12:%.*]] = add i8 [[MUL12]], 1 -; GFX9-NEXT: [[ADD13:%.*]] = add i8 [[MUL13]], 1 -; GFX9-NEXT: [[ADD14:%.*]] = add i8 [[MUL14]], 1 -; GFX9-NEXT: [[ADD15:%.*]] = add i8 [[MUL15]], 1 -; GFX9-NEXT: [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD0]], i64 0 -; GFX9-NEXT: [[VECINS1:%.*]] = insertelement <16 x i8> [[VECINS0]], i8 [[ADD1]], i64 1 -; GFX9-NEXT: [[VECINS2:%.*]] = insertelement <16 x i8> [[VECINS1]], i8 [[ADD2]], i64 2 -; GFX9-NEXT: [[VECINS3:%.*]] = insertelement <16 x i8> [[VECINS2]], i8 [[ADD3]], i64 3 -; GFX9-NEXT: [[VECINS4:%.*]] = insertelement <16 x i8> [[VECINS3]], i8 [[ADD4]], i64 4 -; GFX9-NEXT: [[VECINS5:%.*]] = insertelement <16 x i8> [[VECINS4]], i8 [[ADD5]], i64 5 -; GFX9-NEXT: [[VECINS6:%.*]] = insertelement <16 x i8> [[VECINS5]], i8 [[ADD6]], i64 6 -; GFX9-NEXT: [[VECINS7:%.*]] = insertelement <16 x i8> [[VECINS6]], i8 [[ADD7]], i64 7 -; GFX9-NEXT: [[VECINS8:%.*]] = insertelement <16 x i8> [[VECINS7]], i8 [[ADD8]], i64 8 -; GFX9-NEXT: [[VECINS9:%.*]] = insertelement <16 x i8> [[VECINS8]], i8 [[ADD9]], i64 9 -; GFX9-NEXT: [[VECINS10:%.*]] = insertelement <16 x i8> [[VECINS9]], i8 [[ADD10]], i64 10 -; GFX9-NEXT: 
[[VECINS11:%.*]] = insertelement <16 x i8> [[VECINS10]], i8 [[ADD11]], i64 11 -; GFX9-NEXT: [[VECINS12:%.*]] = insertelement <16 x i8> [[VECINS11]], i8 [[ADD12]], i64 12 -; GFX9-NEXT: [[VECINS13:%.*]] = insertelement <16 x i8> [[VECINS12]], i8 [[ADD13]], i64 13 -; GFX9-NEXT: [[VECINS14:%.*]] = insertelement <16 x i8> [[VECINS13]], i8 [[ADD14]], i64 14 -; GFX9-NEXT: [[VECINS153:%.*]] = insertelement <16 x i8> [[VECINS14]], i8 [[ADD15]], i64 15 +; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> +; GFX9-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[TMP0]], splat (i8 1) +; GFX9-NEXT: [[TMP2:%.*]] = add <4 x i8> [[TMP1]], splat (i8 1) +; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> +; GFX9-NEXT: [[TMP4:%.*]] = mul <4 x i8> [[TMP3]], splat (i8 1) +; GFX9-NEXT: [[TMP5:%.*]] = add <4 x i8> [[TMP4]], splat (i8 1) +; GFX9-NEXT: [[TMP6:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> +; GFX9-NEXT: [[TMP7:%.*]] = mul <4 x i8> [[TMP6]], splat (i8 1) +; GFX9-NEXT: [[TMP8:%.*]] = add <4 x i8> [[TMP7]], splat (i8 1) +; GFX9-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> +; GFX9-NEXT: [[TMP10:%.*]] = mul <4 x i8> [[TMP9]], splat (i8 1) +; GFX9-NEXT: [[TMP11:%.*]] = add <4 x i8> [[TMP10]], splat (i8 1) +; GFX9-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; GFX9-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> +; GFX9-NEXT: [[VECINS71:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> +; GFX9-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> +; GFX9-NEXT: [[VECINS112:%.*]] = shufflevector <16 x i8> [[VECINS71]], <16 x i8> [[TMP14]], <16 x i32> +; GFX9-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> +; GFX9-NEXT: [[VECINS153:%.*]] = shufflevector <16 x i8> [[VECINS112]], <16 x i8> 
[[TMP15]], <16 x i32> ; GFX9-NEXT: store <16 x i8> [[VECINS153]], ptr [[OUT]], align 16 ; GFX9-NEXT: ret void ; @@ -529,19 +401,13 @@ define protected amdgpu_kernel void @phi_2(ptr addrspace(3) %inptr0, ptr addrspa ; GFX8-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] { ; GFX8-NEXT: [[ENTRY:.*]]: ; GFX8-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0 -; GFX8-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 -; GFX8-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1 -; GFX8-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GFX8-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8 ; GFX8-NEXT: br label %[[DO_BODY:.*]] ; GFX8: [[DO_BODY]]: -; GFX8-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ] -; GFX8-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ] -; GFX8-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8 -; GFX8-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1 -; GFX8-NEXT: [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8 -; GFX8-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9 -; GFX8-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8 -; GFX8-NEXT: [[VEC111:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9 +; GFX8-NEXT: [[TMP1:%.*]] = phi <2 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ] +; GFX8-NEXT: [[TMP2]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8 +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> +; GFX8-NEXT: [[VEC111:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> poison, <16 x i32> ; GFX8-NEXT: store <16 x i8> [[VEC111]], ptr addrspace(3) [[INPTR1]], align 2 ; GFX8-NEXT: [[CMP:%.*]] = icmp eq i32 
[[FLAG]], 0 ; GFX8-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]] @@ -554,19 +420,13 @@ define protected amdgpu_kernel void @phi_2(ptr addrspace(3) %inptr0, ptr addrspa ; GFX9-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] { ; GFX9-NEXT: [[ENTRY:.*]]: ; GFX9-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0 -; GFX9-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 -; GFX9-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1 -; GFX9-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GFX9-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8 ; GFX9-NEXT: br label %[[DO_BODY:.*]] ; GFX9: [[DO_BODY]]: -; GFX9-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ] -; GFX9-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ] -; GFX9-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8 -; GFX9-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1 -; GFX9-NEXT: [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8 -; GFX9-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9 -; GFX9-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8 -; GFX9-NEXT: [[VEC111:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9 +; GFX9-NEXT: [[TMP1:%.*]] = phi <2 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ] +; GFX9-NEXT: [[TMP2]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8 +; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> +; GFX9-NEXT: [[VEC111:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> poison, <16 x i32> ; GFX9-NEXT: store <16 x i8> [[VEC111]], ptr addrspace(3) [[INPTR1]], align 2 ; GFX9-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0 ; GFX9-NEXT: br i1 [[CMP]], 
label %[[EXIT:.*]], label %[[DO_BODY]] @@ -637,24 +497,18 @@ define protected amdgpu_kernel void @phi_3(ptr addrspace(3) %inptr0, ptr addrspa ; GFX8-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] { ; GFX8-NEXT: [[ENTRY:.*]]: ; GFX8-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0 -; GFX8-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 -; GFX8-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1 -; GFX8-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GFX8-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8 ; GFX8-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2 ; GFX8-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 ; GFX8-NEXT: br label %[[DO_BODY:.*]] ; GFX8: [[DO_BODY]]: ; GFX8-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ] -; GFX8-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ] -; GFX8-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ] -; GFX8-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8 -; GFX8-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GFX8-NEXT: [[TMP1:%.*]] = phi <2 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ] +; GFX8-NEXT: [[TMP2]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8 ; GFX8-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2 -; GFX8-NEXT: [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8 -; GFX8-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9 +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> ; GFX8-NEXT: [[VEC02:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[OTHERELE2]], i64 10 -; GFX8-NEXT: [[VEC10:%.*]] = insertelement 
<16 x i8> poison, i8 [[PHI3]], i64 8 -; GFX8-NEXT: [[VEC111:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9 +; GFX8-NEXT: [[VEC111:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> poison, <16 x i32> ; GFX8-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC111]], i8 [[PHI1]], i64 10 ; GFX8-NEXT: store <16 x i8> [[VEC12]], ptr addrspace(3) [[INPTR1]], align 2 ; GFX8-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0 @@ -668,24 +522,18 @@ define protected amdgpu_kernel void @phi_3(ptr addrspace(3) %inptr0, ptr addrspa ; GFX9-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] { ; GFX9-NEXT: [[ENTRY:.*]]: ; GFX9-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0 -; GFX9-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 -; GFX9-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1 -; GFX9-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GFX9-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8 ; GFX9-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2 ; GFX9-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 ; GFX9-NEXT: br label %[[DO_BODY:.*]] ; GFX9: [[DO_BODY]]: ; GFX9-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ] -; GFX9-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ] -; GFX9-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ] -; GFX9-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8 -; GFX9-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GFX9-NEXT: [[TMP1:%.*]] = phi <2 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ] +; GFX9-NEXT: [[TMP2]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8 ; GFX9-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2 -; 
GFX9-NEXT: [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8 -; GFX9-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9 +; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> ; GFX9-NEXT: [[VEC02:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[OTHERELE2]], i64 10 -; GFX9-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8 -; GFX9-NEXT: [[VEC111:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9 +; GFX9-NEXT: [[VEC111:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> poison, <16 x i32> ; GFX9-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC111]], i8 [[PHI1]], i64 10 ; GFX9-NEXT: store <16 x i8> [[VEC12]], ptr addrspace(3) [[INPTR1]], align 2 ; GFX9-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0 @@ -769,31 +617,13 @@ define protected amdgpu_kernel void @phi_4(ptr addrspace(3) %inptr0, ptr addrspa ; GFX8-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] { ; GFX8-NEXT: [[ENTRY:.*]]: ; GFX8-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0 -; GFX8-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 -; GFX8-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1 -; GFX8-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 -; GFX8-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2 -; GFX8-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 -; GFX8-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3 -; GFX8-NEXT: [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GFX8-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8 ; GFX8-NEXT: br label %[[DO_BODY:.*]] ; GFX8: [[DO_BODY]]: -; GFX8-NEXT: [[PHI0:%.*]] = phi i8 [ [[ELE3]], %[[ENTRY]] ], [ [[OTHERELE3:%.*]], %[[DO_BODY]] ] -; GFX8-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], 
%[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ] -; GFX8-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ] -; GFX8-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ] -; GFX8-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8 -; GFX8-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1 -; GFX8-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2 -; GFX8-NEXT: [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1 -; GFX8-NEXT: [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8 -; GFX8-NEXT: [[VEC01:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9 -; GFX8-NEXT: [[VEC02:%.*]] = insertelement <16 x i8> [[VEC01]], i8 [[OTHERELE2]], i64 10 -; GFX8-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VEC02]], i8 [[OTHERELE3]], i64 11 -; GFX8-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8 -; GFX8-NEXT: [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9 -; GFX8-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10 -; GFX8-NEXT: [[VEC131:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11 +; GFX8-NEXT: [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ] +; GFX8-NEXT: [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8 +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; GFX8-NEXT: [[VEC131:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> ; GFX8-NEXT: store <16 x i8> [[VEC131]], ptr addrspace(3) [[INPTR1]], align 2 ; GFX8-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0 ; GFX8-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]] @@ -806,31 +636,13 @@ define protected amdgpu_kernel void @phi_4(ptr addrspace(3) %inptr0, ptr addrspa ; GFX9-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr 
[[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] { ; GFX9-NEXT: [[ENTRY:.*]]: ; GFX9-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0 -; GFX9-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 -; GFX9-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1 -; GFX9-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 -; GFX9-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2 -; GFX9-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 -; GFX9-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3 -; GFX9-NEXT: [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GFX9-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8 ; GFX9-NEXT: br label %[[DO_BODY:.*]] ; GFX9: [[DO_BODY]]: -; GFX9-NEXT: [[PHI0:%.*]] = phi i8 [ [[ELE3]], %[[ENTRY]] ], [ [[OTHERELE3:%.*]], %[[DO_BODY]] ] -; GFX9-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ] -; GFX9-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ] -; GFX9-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ] -; GFX9-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8 -; GFX9-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1 -; GFX9-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2 -; GFX9-NEXT: [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1 -; GFX9-NEXT: [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8 -; GFX9-NEXT: [[VEC01:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9 -; GFX9-NEXT: [[VEC02:%.*]] = insertelement <16 x i8> [[VEC01]], i8 [[OTHERELE2]], i64 10 -; GFX9-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VEC02]], i8 [[OTHERELE3]], i64 11 -; GFX9-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8 -; GFX9-NEXT: [[VEC11:%.*]] = insertelement <16 
x i8> [[VEC10]], i8 [[PHI2]], i64 9 -; GFX9-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10 -; GFX9-NEXT: [[VEC131:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11 +; GFX9-NEXT: [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ] +; GFX9-NEXT: [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8 +; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; GFX9-NEXT: [[VEC131:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> ; GFX9-NEXT: store <16 x i8> [[VEC131]], ptr addrspace(3) [[INPTR1]], align 2 ; GFX9-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0 ; GFX9-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]] @@ -882,31 +694,13 @@ define protected amdgpu_kernel void @phi_4_with_stores(ptr addrspace(3) %inptr0, ; GFX7-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] { ; GFX7-NEXT: [[ENTRY:.*]]: ; GFX7-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0 -; GFX7-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 -; GFX7-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1 -; GFX7-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 -; GFX7-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2 -; GFX7-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 -; GFX7-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3 -; GFX7-NEXT: [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GFX7-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8 ; GFX7-NEXT: br label %[[DO_BODY:.*]] ; GFX7: [[DO_BODY]]: -; GFX7-NEXT: [[PHI0:%.*]] = phi i8 [ [[ELE3]], %[[ENTRY]] ], [ [[OTHERELE3:%.*]], %[[DO_BODY]] ] -; GFX7-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ] -; 
GFX7-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ] -; GFX7-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ] -; GFX7-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8 -; GFX7-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1 -; GFX7-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2 -; GFX7-NEXT: [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1 -; GFX7-NEXT: store i8 [[PHI3]], ptr addrspace(3) [[GEP0]], align 2 -; GFX7-NEXT: store i8 [[PHI2]], ptr addrspace(3) [[GEP1]], align 2 -; GFX7-NEXT: store i8 [[PHI1]], ptr addrspace(3) [[GEP2]], align 2 -; GFX7-NEXT: store i8 [[PHI0]], ptr addrspace(3) [[GEP3]], align 2 -; GFX7-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8 -; GFX7-NEXT: [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9 -; GFX7-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10 -; GFX7-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11 +; GFX7-NEXT: [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ] +; GFX7-NEXT: [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8 +; GFX7-NEXT: store <4 x i8> [[TMP1]], ptr addrspace(3) [[GEP0]], align 2 +; GFX7-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> ; GFX7-NEXT: store <16 x i8> [[TMP3]], ptr addrspace(3) [[INPTR1]], align 2 ; GFX7-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0 ; GFX7-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]] @@ -919,31 +713,13 @@ define protected amdgpu_kernel void @phi_4_with_stores(ptr addrspace(3) %inptr0, ; GFX8-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] { ; GFX8-NEXT: [[ENTRY:.*]]: ; GFX8-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0 -; GFX8-NEXT: 
[[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 -; GFX8-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1 -; GFX8-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 -; GFX8-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2 -; GFX8-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 -; GFX8-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3 -; GFX8-NEXT: [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GFX8-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8 ; GFX8-NEXT: br label %[[DO_BODY:.*]] ; GFX8: [[DO_BODY]]: -; GFX8-NEXT: [[PHI0:%.*]] = phi i8 [ [[ELE3]], %[[ENTRY]] ], [ [[OTHERELE3:%.*]], %[[DO_BODY]] ] -; GFX8-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ] -; GFX8-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ] -; GFX8-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ] -; GFX8-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8 -; GFX8-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1 -; GFX8-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2 -; GFX8-NEXT: [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1 -; GFX8-NEXT: store i8 [[PHI3]], ptr addrspace(3) [[GEP0]], align 2 -; GFX8-NEXT: store i8 [[PHI2]], ptr addrspace(3) [[GEP1]], align 2 -; GFX8-NEXT: store i8 [[PHI1]], ptr addrspace(3) [[GEP2]], align 2 -; GFX8-NEXT: store i8 [[PHI0]], ptr addrspace(3) [[GEP3]], align 2 -; GFX8-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8 -; GFX8-NEXT: [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9 -; GFX8-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10 -; GFX8-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11 +; GFX8-NEXT: [[TMP1:%.*]] = phi <4 x i8> [ 
[[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ] +; GFX8-NEXT: [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8 +; GFX8-NEXT: store <4 x i8> [[TMP1]], ptr addrspace(3) [[GEP0]], align 2 +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> ; GFX8-NEXT: store <16 x i8> [[TMP3]], ptr addrspace(3) [[INPTR1]], align 2 ; GFX8-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0 ; GFX8-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]] @@ -956,31 +732,13 @@ define protected amdgpu_kernel void @phi_4_with_stores(ptr addrspace(3) %inptr0, ; GFX9-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] { ; GFX9-NEXT: [[ENTRY:.*]]: ; GFX9-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0 -; GFX9-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 -; GFX9-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1 -; GFX9-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 -; GFX9-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2 -; GFX9-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 -; GFX9-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3 -; GFX9-NEXT: [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GFX9-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8 ; GFX9-NEXT: br label %[[DO_BODY:.*]] ; GFX9: [[DO_BODY]]: -; GFX9-NEXT: [[PHI0:%.*]] = phi i8 [ [[ELE3]], %[[ENTRY]] ], [ [[OTHERELE3:%.*]], %[[DO_BODY]] ] -; GFX9-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ] -; GFX9-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ] -; GFX9-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ] -; GFX9-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8 -; GFX9-NEXT: 
[[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1 -; GFX9-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2 -; GFX9-NEXT: [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1 -; GFX9-NEXT: store i8 [[PHI3]], ptr addrspace(3) [[GEP0]], align 2 -; GFX9-NEXT: store i8 [[PHI2]], ptr addrspace(3) [[GEP1]], align 2 -; GFX9-NEXT: store i8 [[PHI1]], ptr addrspace(3) [[GEP2]], align 2 -; GFX9-NEXT: store i8 [[PHI0]], ptr addrspace(3) [[GEP3]], align 2 -; GFX9-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8 -; GFX9-NEXT: [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9 -; GFX9-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10 -; GFX9-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11 +; GFX9-NEXT: [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ] +; GFX9-NEXT: [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8 +; GFX9-NEXT: store <4 x i8> [[TMP1]], ptr addrspace(3) [[GEP0]], align 2 +; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> ; GFX9-NEXT: store <16 x i8> [[TMP3]], ptr addrspace(3) [[INPTR1]], align 2 ; GFX9-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0 ; GFX9-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]] @@ -1031,10 +789,6 @@ define protected amdgpu_kernel void @phi_4_with_stores_outside_loop(<4 x i8> %in ; GFX7-LABEL: define protected amdgpu_kernel void @phi_4_with_stores_outside_loop( ; GFX7-SAME: <4 x i8> [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr addrspace(3) [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] { ; GFX7-NEXT: [[ENTRY:.*]]: -; GFX7-NEXT: [[TMP4:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 3 -; GFX7-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 2 -; GFX7-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 1 -; GFX7-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 0 ; 
GFX7-NEXT: br label %[[DO_BODY:.*]] ; GFX7: [[DO_BODY]]: ; GFX7-NEXT: [[TMP0:%.*]] = phi <4 x i8> [ [[INPTR0]], %[[ENTRY]] ], [ [[INPTR0]], %[[DO_BODY]] ] @@ -1044,22 +798,12 @@ define protected amdgpu_kernel void @phi_4_with_stores_outside_loop(<4 x i8> %in ; GFX7-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]] ; GFX7: [[EXIT]]: ; GFX7-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 0 -; GFX7-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 1 -; GFX7-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 2 -; GFX7-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 3 -; GFX7-NEXT: store i8 [[TMP3]], ptr addrspace(3) [[GEP0]], align 1 -; GFX7-NEXT: store i8 [[TMP2]], ptr addrspace(3) [[GEP1]], align 1 -; GFX7-NEXT: store i8 [[TMP1]], ptr addrspace(3) [[GEP2]], align 1 -; GFX7-NEXT: store i8 [[TMP4]], ptr addrspace(3) [[GEP3]], align 1 +; GFX7-NEXT: store <4 x i8> [[INPTR0]], ptr addrspace(3) [[GEP0]], align 1 ; GFX7-NEXT: ret void ; ; GFX8-LABEL: define protected amdgpu_kernel void @phi_4_with_stores_outside_loop( ; GFX8-SAME: <4 x i8> [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr addrspace(3) [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] { ; GFX8-NEXT: [[ENTRY:.*]]: -; GFX8-NEXT: [[TMP4:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 3 -; GFX8-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 2 -; GFX8-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 1 -; GFX8-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 0 ; GFX8-NEXT: br label %[[DO_BODY:.*]] ; GFX8: [[DO_BODY]]: ; GFX8-NEXT: [[TMP0:%.*]] = phi <4 x i8> [ [[INPTR0]], %[[ENTRY]] ], [ [[INPTR0]], %[[DO_BODY]] ] @@ -1069,22 +813,12 @@ define protected amdgpu_kernel void @phi_4_with_stores_outside_loop(<4 x i8> %in ; GFX8-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]] ; GFX8: [[EXIT]]: ; GFX8-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], 
i32 0 -; GFX8-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 1 -; GFX8-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 2 -; GFX8-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 3 -; GFX8-NEXT: store i8 [[TMP3]], ptr addrspace(3) [[GEP0]], align 1 -; GFX8-NEXT: store i8 [[TMP2]], ptr addrspace(3) [[GEP1]], align 1 -; GFX8-NEXT: store i8 [[TMP1]], ptr addrspace(3) [[GEP2]], align 1 -; GFX8-NEXT: store i8 [[TMP4]], ptr addrspace(3) [[GEP3]], align 1 +; GFX8-NEXT: store <4 x i8> [[INPTR0]], ptr addrspace(3) [[GEP0]], align 1 ; GFX8-NEXT: ret void ; ; GFX9-LABEL: define protected amdgpu_kernel void @phi_4_with_stores_outside_loop( ; GFX9-SAME: <4 x i8> [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr addrspace(3) [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] { ; GFX9-NEXT: [[ENTRY:.*]]: -; GFX9-NEXT: [[TMP4:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 3 -; GFX9-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 2 -; GFX9-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 1 -; GFX9-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 0 ; GFX9-NEXT: br label %[[DO_BODY:.*]] ; GFX9: [[DO_BODY]]: ; GFX9-NEXT: [[TMP0:%.*]] = phi <4 x i8> [ [[INPTR0]], %[[ENTRY]] ], [ [[INPTR0]], %[[DO_BODY]] ] @@ -1094,13 +828,7 @@ define protected amdgpu_kernel void @phi_4_with_stores_outside_loop(<4 x i8> %in ; GFX9-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]] ; GFX9: [[EXIT]]: ; GFX9-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 0 -; GFX9-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 1 -; GFX9-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 2 -; GFX9-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 3 -; GFX9-NEXT: store i8 [[TMP3]], ptr addrspace(3) [[GEP0]], align 1 -; GFX9-NEXT: store i8 [[TMP2]], ptr addrspace(3) [[GEP1]], align 1 -; GFX9-NEXT: store i8 [[TMP1]], ptr 
addrspace(3) [[GEP2]], align 1 -; GFX9-NEXT: store i8 [[TMP4]], ptr addrspace(3) [[GEP3]], align 1 +; GFX9-NEXT: store <4 x i8> [[INPTR0]], ptr addrspace(3) [[GEP0]], align 1 ; GFX9-NEXT: ret void ; entry: