diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 09f7877b13b3a..5d3502e5d810c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1120,6 +1120,64 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
   }
 }
 
+/// Certain shuffle masks are neither Identity masks nor InsertSubvector masks,
+/// yet require no instructions to produce. For example, shuffling two
+/// <16 x i8> sources with the 16 element mask {0, 1, 2, 3, 4, 5, 6, 7, 24, 25,
+/// 26, 27, poison, poison, poison, poison} yields {first v4i8 of src0, second
+/// v4i8 of src0, third v4i8 of src1, poison}. No shuffle code is needed for
+/// this result because each of these vectors already exists in a source
+/// register; we only need to ensure the registers are contiguous.
+/// countIdentityPerms analyzes \p Mask and returns the number of such
+/// register-aligned 32-bit groups, given the element width \p ScalarSize in
+/// bits. A group whose elements are all poison (-1) counts as an identity.
+/// Trailing elements that do not fill a whole 32-bit group are not counted.
+static unsigned countIdentityPerms(ArrayRef<int> Mask, unsigned ScalarSize) {
+  // Guard the division below; elements wider than 32 bits never qualify.
+  if (ScalarSize == 0 || ScalarSize > 32)
+    return 0;
+  const unsigned EltsPerPerm = 32 / ScalarSize;
+  const unsigned NumFullPerms = Mask.size() / EltsPerPerm;
+  unsigned IdentityPerms = 0;
+
+  // Split the shuffle mask into a number of 32 bit wide shuffles.
+  for (unsigned PermCand = 0; PermCand < NumFullPerms; ++PermCand) {
+    // {mask value, position in the group} of the first non-poison element.
+    std::pair<int, int> BasisIndex(-1, -1);
+    bool FoundMismatch = false;
+
+    // Analyze the 32 bit mask for register-aligned vectors.
+    for (int PermElement = 0; PermElement < (int)EltsPerPerm; ++PermElement) {
+      unsigned Index = PermCand * EltsPerPerm + PermElement;
+      assert(Index < Mask.size());
+      int MaskVal = Mask[Index];
+
+      // A mask value of -1 is a don't-care.
+      if (MaskVal == -1)
+        continue;
+      if (BasisIndex.second == -1) {
+        // The first defined element must sit at the same position within its
+        // source group as it does within the result group.
+        if (PermElement > MaskVal || ((MaskVal - PermElement) % EltsPerPerm)) {
+          FoundMismatch = true;
+          break;
+        }
+        BasisIndex = {MaskVal, PermElement};
+        continue;
+      }
+
+      // Every later element must be contiguous with the basis element. This
+      // also rejects mask values smaller than the basis value.
+      if ((MaskVal - BasisIndex.first) != (PermElement - BasisIndex.second)) {
+        FoundMismatch = true;
+        break;
+      }
+    }
+    if (!FoundMismatch)
+      ++IdentityPerms;
+  }
+  return IdentityPerms;
+}
+
 InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                            VectorType *VT, ArrayRef<int> Mask,
                                            TTI::TargetCostKind CostKind,
@@ -1133,12 +1191,13 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
 
   // Larger vector widths may require additional instructions, but are
   // typically cheaper than scalarized versions.
-  unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
+  unsigned ScalarSize = DL.getTypeSizeInBits(VT->getElementType());
   if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
-      DL.getTypeSizeInBits(VT->getElementType()) == 16) {
+      ScalarSize == 16) {
     bool HasVOP3P = ST->hasVOP3PInsts();
     unsigned RequestedElts =
         count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
+    unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
     if (RequestedElts == 0)
       return 0;
     switch (Kind) {
@@ -1149,9 +1208,13 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
       // half of a register, so any swizzle of two elements is free.
       if (HasVOP3P && NumVectorElts == 2)
         return 0;
-      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+      unsigned NumPerms = alignTo(Mask.size(), 2) / 2;
+      unsigned IdentPerms = countIdentityPerms(Mask, ScalarSize);
+      assert(IdentPerms <= NumPerms);
+      NumPerms -= IdentPerms;
       // SK_Broadcast just reuses the same mask
-      unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
+      unsigned NumPermMasks =
+          (Kind == TTI::SK_Broadcast && NumPerms > 1) ? 1 : NumPerms;
       return NumPerms + NumPermMasks;
     }
     case TTI::SK_ExtractSubvector:
@@ -1166,9 +1229,13 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     case TTI::SK_PermuteTwoSrc:
     case TTI::SK_Splice:
     case TTI::SK_Select: {
-      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+      unsigned NumPerms = alignTo(Mask.size(), 2) / 2;
+      unsigned IdentPerms = countIdentityPerms(Mask, ScalarSize);
+      assert(IdentPerms <= NumPerms);
+      NumPerms -= IdentPerms;
       // SK_Select just reuses the same mask
-      unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
+      unsigned NumPermMasks =
+          (Kind == TTI::SK_Select && NumPerms > 1) ? 1 : NumPerms;
       return NumPerms + NumPermMasks;
     }
 
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
index 7107d2be579c6..988dc5f7920fe 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
@@ -27,8 +27,8 @@ define amdgpu_kernel void @shufflevector_i16(<2 x i16> %vec1, <2 x i16> %vec2) {
 ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32>
 ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> zeroinitializer
 ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf010 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost
of 2 for instruction: %shuf011 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> @@ -64,8 +64,8 @@ define amdgpu_kernel void @shufflevector_i16(<2 x i16> %vec1, <2 x i16> %vec2) { ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> zeroinitializer ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf010_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf011_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 
for instruction: %shuf110_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> @@ -104,8 +104,8 @@ define amdgpu_kernel void @shufflevector_i16(<2 x i16> %vec1, <2 x i16> %vec2) { ; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf32 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> zeroinitializer ; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf010 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf011 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> @@ -141,8 +141,8 @@ define amdgpu_kernel void @shufflevector_i16(<2 x i16> %vec1, <2 x i16> %vec2) { ; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf32_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> zeroinitializer ; VI-NEXT: Cost 
Model: Found an estimated cost of 4 for instruction: %shuf001_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf010_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf011_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> @@ -181,8 +181,8 @@ define amdgpu_kernel void @shufflevector_i16(<2 x i16> %vec1, <2 x i16> %vec2) { ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> zeroinitializer ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %shuf010 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf011 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> @@ -218,8 +218,8 @@ define amdgpu_kernel void @shufflevector_i16(<2 x i16> %vec1, <2 x i16> %vec2) { ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> zeroinitializer ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf010_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf011_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100_2 = shufflevector <2 x i16> 
%vec1, <2 x i16> %vec2, <3 x i32> ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> @@ -258,8 +258,8 @@ define amdgpu_kernel void @shufflevector_i16(<2 x i16> %vec1, <2 x i16> %vec2) { ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf32 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> zeroinitializer ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf010 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf011 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> @@ -295,8 +295,8 @@ define amdgpu_kernel void @shufflevector_i16(<2 x i16> %vec1, <2 x i16> %vec2) { ; 
VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf32_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> zeroinitializer ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf010_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf011_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> @@ -881,8 +881,8 @@ define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, 
<4 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <8 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16_8 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <8 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16_8_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <8 x i32> ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <2 x i32> ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <2 x i32> ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <4 x i32> @@ -918,8 +918,8 @@ define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> ; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <8 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16_8 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <8 x i32> +; 
VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16_8_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <8 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <4 x i32> @@ -955,8 +955,8 @@ define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <4 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <8 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16_8 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <8 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16_8_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <8 x i32> ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <2 x i32> ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2_2 = shufflevector <2 x 
i32> %i32v2, <2 x i32> %i32v2_2, <2 x i32> ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <4 x i32> @@ -992,8 +992,8 @@ define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <8 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16_8 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <8 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16_8_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <8 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/register_aligned_shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/register_aligned_shuffle.ll new file mode 100644 index 0000000000000..aa83547d070e0 --- /dev/null +++ 
b/llvm/test/Transforms/SLPVectorizer/AMDGPU/register_aligned_shuffle.ll @@ -0,0 +1,226 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,dce < %s | FileCheck -check-prefixes=GCN,VI %s + +define protected amdgpu_kernel void @phi_4(ptr addrspace(3) %inptr0, ptr addrspace(3) %inptr1, ptr %out, ptr %out1, ptr %out2, i32 %flag) { +; GCN-LABEL: define protected amdgpu_kernel void @phi_4( +; GCN-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], ptr [[OUT2:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0:[0-9]+]] { +; GCN-NEXT: [[ENTRY:.*]]: +; GCN-NEXT: [[GEP0:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 0 +; GCN-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP0]], align 8 +; GCN-NEXT: [[GEP2:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 2 +; GCN-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP2]], align 2 +; GCN-NEXT: [[GEP4:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 4 +; GCN-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP4]], align 8 +; GCN-NEXT: [[GEP6:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 6 +; GCN-NEXT: [[TMP3:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP6]], align 2 +; GCN-NEXT: [[GEP8:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 8 +; GCN-NEXT: [[TMP4:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP8]], align 8 +; GCN-NEXT: [[GEP10:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 10 +; GCN-NEXT: [[TMP5:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP10]], align 2 +; GCN-NEXT: [[GEP12:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 12 +; GCN-NEXT: [[TMP6:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP12]], align 8 +; GCN-NEXT: [[GEP14:%.*]] = 
getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 14 +; GCN-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP14]], align 2 +; GCN-NEXT: br label %[[DO_BODY:.*]] +; GCN: [[DO_BODY]]: +; GCN-NEXT: [[TMP8:%.*]] = phi <2 x i16> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP16:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP9:%.*]] = phi <2 x i16> [ [[TMP1]], %[[ENTRY]] ], [ [[TMP17:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP10:%.*]] = phi <2 x i16> [ [[TMP2]], %[[ENTRY]] ], [ [[TMP18:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP11:%.*]] = phi <2 x i16> [ [[TMP3]], %[[ENTRY]] ], [ [[TMP19:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP12:%.*]] = phi <2 x i16> [ [[TMP4]], %[[ENTRY]] ], [ [[TMP20:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP13:%.*]] = phi <2 x i16> [ [[TMP5]], %[[ENTRY]] ], [ [[TMP21:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP6]], %[[ENTRY]] ], [ [[TMP22:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP15:%.*]] = phi <2 x i16> [ [[TMP7]], %[[ENTRY]] ], [ [[TMP23:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP16]] = load <2 x i16>, ptr addrspace(3) [[GEP0]], align 8 +; GCN-NEXT: [[TMP17]] = load <2 x i16>, ptr addrspace(3) [[GEP2]], align 2 +; GCN-NEXT: [[TMP18]] = load <2 x i16>, ptr addrspace(3) [[GEP4]], align 8 +; GCN-NEXT: [[TMP19]] = load <2 x i16>, ptr addrspace(3) [[GEP6]], align 2 +; GCN-NEXT: [[TMP20]] = load <2 x i16>, ptr addrspace(3) [[GEP8]], align 8 +; GCN-NEXT: [[TMP21]] = load <2 x i16>, ptr addrspace(3) [[GEP10]], align 2 +; GCN-NEXT: [[TMP22]] = load <2 x i16>, ptr addrspace(3) [[GEP12]], align 8 +; GCN-NEXT: [[TMP23]] = load <2 x i16>, ptr addrspace(3) [[GEP14]], align 2 +; GCN-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0 +; GCN-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]] +; GCN: [[EXIT]]: +; GCN-NEXT: [[TMP24:%.*]] = shufflevector <2 x i16> [[TMP16]], <2 x i16> [[TMP17]], <16 x i32> +; GCN-NEXT: [[TMP25:%.*]] = shufflevector <2 x i16> [[TMP18]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP26:%.*]] = shufflevector <16 x i16> 
[[TMP24]], <16 x i16> [[TMP25]], <16 x i32> +; GCN-NEXT: [[TMP27:%.*]] = shufflevector <2 x i16> [[TMP19]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP28:%.*]] = shufflevector <16 x i16> [[TMP26]], <16 x i16> [[TMP27]], <16 x i32> +; GCN-NEXT: [[TMP29:%.*]] = shufflevector <2 x i16> [[TMP20]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP30:%.*]] = shufflevector <16 x i16> [[TMP28]], <16 x i16> [[TMP29]], <16 x i32> +; GCN-NEXT: [[TMP31:%.*]] = shufflevector <2 x i16> [[TMP21]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP32:%.*]] = shufflevector <16 x i16> [[TMP30]], <16 x i16> [[TMP31]], <16 x i32> +; GCN-NEXT: [[TMP33:%.*]] = shufflevector <2 x i16> [[TMP22]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP34:%.*]] = shufflevector <16 x i16> [[TMP32]], <16 x i16> [[TMP33]], <16 x i32> +; GCN-NEXT: [[TMP35:%.*]] = shufflevector <2 x i16> [[TMP23]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP36:%.*]] = shufflevector <16 x i16> [[TMP34]], <16 x i16> [[TMP35]], <16 x i32> +; GCN-NEXT: [[TMP37:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> [[TMP1]], <16 x i32> +; GCN-NEXT: [[TMP38:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP39:%.*]] = shufflevector <16 x i16> [[TMP37]], <16 x i16> [[TMP38]], <16 x i32> +; GCN-NEXT: [[TMP40:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP41:%.*]] = shufflevector <16 x i16> [[TMP39]], <16 x i16> [[TMP40]], <16 x i32> +; GCN-NEXT: [[TMP42:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP43:%.*]] = shufflevector <16 x i16> [[TMP41]], <16 x i16> [[TMP42]], <16 x i32> +; GCN-NEXT: [[TMP44:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP45:%.*]] = shufflevector <16 x i16> [[TMP43]], <16 x i16> [[TMP44]], <16 x i32> +; GCN-NEXT: [[TMP46:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP47:%.*]] = shufflevector <16 x i16> 
[[TMP45]], <16 x i16> [[TMP46]], <16 x i32> +; GCN-NEXT: [[TMP48:%.*]] = shufflevector <2 x i16> [[TMP7]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP49:%.*]] = shufflevector <16 x i16> [[TMP47]], <16 x i16> [[TMP48]], <16 x i32> +; GCN-NEXT: [[TMP50:%.*]] = shufflevector <2 x i16> [[TMP8]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP51:%.*]] = shufflevector <2 x i16> [[TMP9]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[VEC231:%.*]] = shufflevector <16 x i16> [[TMP50]], <16 x i16> [[TMP51]], <16 x i32> +; GCN-NEXT: [[TMP52:%.*]] = shufflevector <2 x i16> [[TMP10]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[VEC252:%.*]] = shufflevector <16 x i16> [[VEC231]], <16 x i16> [[TMP52]], <16 x i32> +; GCN-NEXT: [[TMP53:%.*]] = shufflevector <2 x i16> [[TMP11]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[VEC273:%.*]] = shufflevector <16 x i16> [[VEC252]], <16 x i16> [[TMP53]], <16 x i32> +; GCN-NEXT: [[TMP54:%.*]] = shufflevector <2 x i16> [[TMP12]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[VEC294:%.*]] = shufflevector <16 x i16> [[VEC273]], <16 x i16> [[TMP54]], <16 x i32> +; GCN-NEXT: [[TMP55:%.*]] = shufflevector <2 x i16> [[TMP13]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[VEC2115:%.*]] = shufflevector <16 x i16> [[VEC294]], <16 x i16> [[TMP55]], <16 x i32> +; GCN-NEXT: [[TMP56:%.*]] = shufflevector <2 x i16> [[TMP14]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[VEC2136:%.*]] = shufflevector <16 x i16> [[VEC2115]], <16 x i16> [[TMP56]], <16 x i32> +; GCN-NEXT: [[TMP57:%.*]] = shufflevector <2 x i16> [[TMP15]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[VEC2157:%.*]] = shufflevector <16 x i16> [[VEC2136]], <16 x i16> [[TMP57]], <16 x i32> +; GCN-NEXT: store <16 x i16> [[TMP49]], ptr [[OUT]], align 32 +; GCN-NEXT: store <16 x i16> [[TMP36]], ptr [[OUT1]], align 32 +; GCN-NEXT: store <16 x i16> [[VEC2157]], ptr [[OUT2]], align 32 +; GCN-NEXT: ret void +; +entry: + %gep0 = getelementptr i16, ptr addrspace(3) %inptr0, i32 0 + %ele0 = load i16, ptr 
addrspace(3) %gep0, align 8 + %gep1 = getelementptr i16, ptr addrspace(3) %inptr0, i32 1 + %ele1 = load i16, ptr addrspace(3) %gep1, align 1 + %gep2 = getelementptr i16, ptr addrspace(3) %inptr0, i32 2 + %ele2 = load i16, ptr addrspace(3) %gep2, align 2 + %gep3 = getelementptr i16, ptr addrspace(3) %inptr0, i32 3 + %ele3 = load i16, ptr addrspace(3) %gep3, align 1 + %gep4 = getelementptr i16, ptr addrspace(3) %inptr0, i32 4 + %ele4 = load i16, ptr addrspace(3) %gep4, align 8 + %gep5 = getelementptr i16, ptr addrspace(3) %inptr0, i32 5 + %ele5 = load i16, ptr addrspace(3) %gep5, align 1 + %gep6 = getelementptr i16, ptr addrspace(3) %inptr0, i32 6 + %ele6 = load i16, ptr addrspace(3) %gep6, align 2 + %gep7 = getelementptr i16, ptr addrspace(3) %inptr0, i32 7 + %ele7 = load i16, ptr addrspace(3) %gep7, align 1 + %gep8 = getelementptr i16, ptr addrspace(3) %inptr0, i32 8 + %ele8 = load i16, ptr addrspace(3) %gep8, align 8 + %gep9 = getelementptr i16, ptr addrspace(3) %inptr0, i32 9 + %ele9 = load i16, ptr addrspace(3) %gep9, align 1 + %gep10 = getelementptr i16, ptr addrspace(3) %inptr0, i32 10 + %ele10 = load i16, ptr addrspace(3) %gep10, align 2 + %gep11 = getelementptr i16, ptr addrspace(3) %inptr0, i32 11 + %ele11 = load i16, ptr addrspace(3) %gep11, align 1 + %gep12 = getelementptr i16, ptr addrspace(3) %inptr0, i32 12 + %ele12 = load i16, ptr addrspace(3) %gep12, align 8 + %gep13 = getelementptr i16, ptr addrspace(3) %inptr0, i32 13 + %ele13 = load i16, ptr addrspace(3) %gep13, align 1 + %gep14 = getelementptr i16, ptr addrspace(3) %inptr0, i32 14 + %ele14 = load i16, ptr addrspace(3) %gep14, align 2 + %gep15 = getelementptr i16, ptr addrspace(3) %inptr0, i32 15 + %ele15 = load i16, ptr addrspace(3) %gep15, align 1 + br label %do.body + +do.body: + %phi0 = phi i16 [ %ele0, %entry ], [ %otherele0, %do.body ] + %phi1 = phi i16 [ %ele1, %entry ], [ %otherele1, %do.body ] + %phi2 = phi i16 [ %ele2, %entry ], [ %otherele2, %do.body ] + %phi3 = phi i16 [ %ele3, %entry 
], [ %otherele3, %do.body ] + %phi4 = phi i16 [ %ele4, %entry ], [ %otherele4, %do.body ] + %phi5 = phi i16 [ %ele5, %entry ], [ %otherele5, %do.body ] + %phi6 = phi i16 [ %ele6, %entry ], [ %otherele6, %do.body ] + %phi7 = phi i16 [ %ele7, %entry ], [ %otherele7, %do.body ] + %phi8 = phi i16 [ %ele8, %entry ], [ %otherele8, %do.body ] + %phi9 = phi i16 [ %ele9, %entry ], [ %otherele9, %do.body ] + %phi10 = phi i16 [ %ele10, %entry ], [ %otherele10, %do.body ] + %phi11 = phi i16 [ %ele11, %entry ], [ %otherele11, %do.body ] + %phi12 = phi i16 [ %ele12, %entry ], [ %otherele12, %do.body ] + %phi13 = phi i16 [ %ele13, %entry ], [ %otherele13, %do.body ] + %phi14 = phi i16 [ %ele14, %entry ], [ %otherele14, %do.body ] + %phi15 = phi i16 [ %ele15, %entry ], [ %otherele15, %do.body ] + + %otherele0 = load i16, ptr addrspace(3) %gep0, align 8 + %otherele1 = load i16, ptr addrspace(3) %gep1, align 1 + %otherele2 = load i16, ptr addrspace(3) %gep2, align 2 + %otherele3 = load i16, ptr addrspace(3) %gep3, align 1 + %otherele4 = load i16, ptr addrspace(3) %gep4, align 8 + %otherele5 = load i16, ptr addrspace(3) %gep5, align 1 + %otherele6 = load i16, ptr addrspace(3) %gep6, align 2 + %otherele7 = load i16, ptr addrspace(3) %gep7, align 1 + %otherele8 = load i16, ptr addrspace(3) %gep8, align 8 + %otherele9 = load i16, ptr addrspace(3) %gep9, align 1 + %otherele10 = load i16, ptr addrspace(3) %gep10, align 2 + %otherele11 = load i16, ptr addrspace(3) %gep11, align 1 + %otherele12 = load i16, ptr addrspace(3) %gep12, align 8 + %otherele13 = load i16, ptr addrspace(3) %gep13, align 1 + %otherele14 = load i16, ptr addrspace(3) %gep14, align 2 + %otherele15 = load i16, ptr addrspace(3) %gep15, align 1 + %cmp = icmp eq i32 %flag, 0 + br i1 %cmp, label %exit, label %do.body + +exit: + %vec00 = insertelement <16 x i16> poison, i16 %otherele0, i64 0 + %vec01 = insertelement <16 x i16> %vec00, i16 %otherele1, i64 1 + %vec02 = insertelement <16 x i16> %vec01, i16 %otherele2, i64 2 + 
%vec03 = insertelement <16 x i16> %vec02, i16 %otherele3, i64 3 + %vec04 = insertelement <16 x i16> %vec03, i16 %otherele4, i64 4 + %vec05 = insertelement <16 x i16> %vec04, i16 %otherele5, i64 5 + %vec06 = insertelement <16 x i16> %vec05, i16 %otherele6, i64 6 + %vec07 = insertelement <16 x i16> %vec06, i16 %otherele7, i64 7 + %vec08 = insertelement <16 x i16> %vec07, i16 %otherele8, i64 8 + %vec09 = insertelement <16 x i16> %vec08, i16 %otherele9, i64 9 + %vec010 = insertelement <16 x i16> %vec09, i16 %otherele10, i64 10 + %vec011 = insertelement <16 x i16> %vec010, i16 %otherele11, i64 11 + %vec012 = insertelement <16 x i16> %vec011, i16 %otherele12, i64 12 + %vec013 = insertelement <16 x i16> %vec012, i16 %otherele13, i64 13 + %vec014 = insertelement <16 x i16> %vec013, i16 %otherele14, i64 14 + %vec015 = insertelement <16 x i16> %vec014, i16 %otherele15, i64 15 + + %vec10 = insertelement <16 x i16> poison, i16 %ele0, i64 0 + %vec11 = insertelement <16 x i16> %vec10, i16 %ele1, i64 1 + %vec12 = insertelement <16 x i16> %vec11, i16 %ele2, i64 2 + %vec13 = insertelement <16 x i16> %vec12, i16 %ele3, i64 3 + %vec14 = insertelement <16 x i16> %vec13, i16 %ele4, i64 4 + %vec15 = insertelement <16 x i16> %vec14, i16 %ele5, i64 5 + %vec16 = insertelement <16 x i16> %vec15, i16 %ele6, i64 6 + %vec17 = insertelement <16 x i16> %vec16, i16 %ele7, i64 7 + %vec18 = insertelement <16 x i16> %vec17, i16 %ele8, i64 8 + %vec19 = insertelement <16 x i16> %vec18, i16 %ele9, i64 9 + %vec110 = insertelement <16 x i16> %vec19, i16 %ele10, i64 10 + %vec111 = insertelement <16 x i16> %vec110, i16 %ele11, i64 11 + %vec112 = insertelement <16 x i16> %vec111, i16 %ele12, i64 12 + %vec113 = insertelement <16 x i16> %vec112, i16 %ele13, i64 13 + %vec114 = insertelement <16 x i16> %vec113, i16 %ele14, i64 14 + %vec115 = insertelement <16 x i16> %vec114, i16 %ele15, i64 15 + + %vec20 = insertelement <16 x i16> poison, i16 %phi0, i64 0 + %vec21 = insertelement <16 x i16> %vec20, i16 %phi1, 
i64 1 + %vec22 = insertelement <16 x i16> %vec21, i16 %phi2, i64 2 + %vec23 = insertelement <16 x i16> %vec22, i16 %phi3, i64 3 + %vec24 = insertelement <16 x i16> %vec23, i16 %phi4, i64 4 + %vec25 = insertelement <16 x i16> %vec24, i16 %phi5, i64 5 + %vec26 = insertelement <16 x i16> %vec25, i16 %phi6, i64 6 + %vec27 = insertelement <16 x i16> %vec26, i16 %phi7, i64 7 + %vec28 = insertelement <16 x i16> %vec27, i16 %phi8, i64 8 + %vec29 = insertelement <16 x i16> %vec28, i16 %phi9, i64 9 + %vec210 = insertelement <16 x i16> %vec29, i16 %phi10, i64 10 + %vec211 = insertelement <16 x i16> %vec210, i16 %phi11, i64 11 + %vec212 = insertelement <16 x i16> %vec211, i16 %phi12, i64 12 + %vec213 = insertelement <16 x i16> %vec212, i16 %phi13, i64 13 + %vec214 = insertelement <16 x i16> %vec213, i16 %phi14, i64 14 + %vec215 = insertelement <16 x i16> %vec214, i16 %phi15, i64 15 + + + + store <16 x i16> %vec115, ptr %out + store <16 x i16> %vec015, ptr %out1 + store <16 x i16> %vec215, ptr %out2 + + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX9: {{.*}} +; VI: {{.*}}