diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 9240a3c3127eb..87409c88788e6 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -922,6 +922,10 @@ class CombinerHelper { bool matchUnmergeValuesAnyExtBuildVector(const MachineInstr &MI, BuildFnTy &MatchInfo); + // unmerge_values(opaque vector) -> extract vector elt + bool matchUnmergeValuesOfScalarAndVector(const MachineInstr &MI, + BuildFnTy &MatchInfo); + private: /// Checks for legality of an indexed variant of \p LdSt. bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const; diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index ead4149fc1106..39dd58837d575 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -840,6 +840,14 @@ def unmerge_anyext_build_vector : GICombineRule< (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }]) >; +// Transform unmerge opaque vector -> extract vector elt +def unmerge_opaque_vector : GICombineRule< + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (wip_match_opcode G_UNMERGE_VALUES): $root, + [{ return Helper.matchUnmergeValuesOfScalarAndVector(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }]) +>; + // Transform x,y = unmerge(zext(z)) -> x = zext z; y = 0. def unmerge_zext_to_zext : GICombineRule< (defs root:$d), @@ -855,7 +863,8 @@ def merge_combines: GICombineGroup<[ unmerge_cst, unmerge_undef, unmerge_dead_to_trunc, - unmerge_zext_to_zext + unmerge_zext_to_zext, + unmerge_opaque_vector ]>; // Under certain conditions, transform: diff --git a/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt b/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt index af1717dbf76f3..a45024d120be6 100644 --- a/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt +++ b/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt @@ -6,6 +6,7 @@ add_llvm_component_library(LLVMGlobalISel GlobalISel.cpp Combiner.cpp CombinerHelper.cpp + CombinerHelperArtifacts.cpp CombinerHelperCasts.cpp CombinerHelperCompares.cpp CombinerHelperVectorOps.cpp diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index b7ddf9f479ef8..f9b1621955c21 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -7611,85 +7611,3 @@ bool CombinerHelper::matchFoldAMinusC1PlusC2(const MachineInstr &MI, return true; } - -bool CombinerHelper::matchUnmergeValuesAnyExtBuildVector(const MachineInstr &MI, - BuildFnTy &MatchInfo) { - const GUnmerge *Unmerge = cast<GUnmerge>(&MI); - - if (!MRI.hasOneNonDBGUse(Unmerge->getSourceReg())) - return false; - - const MachineInstr *Source = MRI.getVRegDef(Unmerge->getSourceReg()); - - LLT DstTy = MRI.getType(Unmerge->getReg(0)); - - // $bv:_(<8 x s8>) = G_BUILD_VECTOR ....
- // $any:_(<8 x s16>) = G_ANYEXT $bv - // $uv:_(<4 x s16>), $uv1:_(<4 x s16>) = G_UNMERGE_VALUES $any - // - // -> - // - // $any:_(s16) = G_ANYEXT $bv[0] - // $any1:_(s16) = G_ANYEXT $bv[1] - // $any2:_(s16) = G_ANYEXT $bv[2] - // $any3:_(s16) = G_ANYEXT $bv[3] - // $any4:_(s16) = G_ANYEXT $bv[4] - // $any5:_(s16) = G_ANYEXT $bv[5] - // $any6:_(s16) = G_ANYEXT $bv[6] - // $any7:_(s16) = G_ANYEXT $bv[7] - // $uv:_(<4 x s16>) = G_BUILD_VECTOR $any, $any1, $any2, $any3 - // $uv1:_(<4 x s16>) = G_BUILD_VECTOR $any4, $any5, $any6, $any7 - - // We want to unmerge into vectors. - if (!DstTy.isFixedVector()) - return false; - - const GAnyExt *Any = dyn_cast<GAnyExt>(Source); - if (!Any) - return false; - - const MachineInstr *NextSource = MRI.getVRegDef(Any->getSrcReg()); - - if (const GBuildVector *BV = dyn_cast<GBuildVector>(NextSource)) { - // G_UNMERGE_VALUES G_ANYEXT G_BUILD_VECTOR - - if (!MRI.hasOneNonDBGUse(BV->getReg(0))) - return false; - - // FIXME: check element types? - if (BV->getNumSources() % Unmerge->getNumDefs() != 0) - return false; - - LLT BigBvTy = MRI.getType(BV->getReg(0)); - LLT SmallBvTy = DstTy; - LLT SmallBvElemenTy = SmallBvTy.getElementType(); - - if (!isLegalOrBeforeLegalizer( - {TargetOpcode::G_BUILD_VECTOR, {SmallBvTy, SmallBvElemenTy}})) - return false; - - // We check the legality of scalar anyext. - if (!isLegalOrBeforeLegalizer( - {TargetOpcode::G_ANYEXT, - {SmallBvElemenTy, BigBvTy.getElementType()}})) - return false; - - MatchInfo = [=](MachineIRBuilder &B) { - // Build into each G_UNMERGE_VALUES def - // a small build vector with anyext from the source build vector. - for (unsigned I = 0; I < Unmerge->getNumDefs(); ++I) { - SmallVector<Register> Ops; - for (unsigned J = 0; J < SmallBvTy.getNumElements(); ++J) { - Register SourceArray = - BV->getSourceReg(I * SmallBvTy.getNumElements() + J); - auto AnyExt = B.buildAnyExt(SmallBvElemenTy, SourceArray); - Ops.push_back(AnyExt.getReg(0)); - } - B.buildBuildVector(Unmerge->getOperand(I).getReg(), Ops); - }; - }; - return true; - }; - - return false; -} diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp new file mode 100644 index 0000000000000..cab250ee7e62f --- /dev/null +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp @@ -0,0 +1,169 @@ +//===- CombinerHelperArtifacts.cpp-----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements CombinerHelper for legalization artifacts.
+// +//===----------------------------------------------------------------------===// +// +// G_UNMERGE_VALUES +// +//===----------------------------------------------------------------------===// +#include "llvm/CodeGen/GlobalISel/CombinerHelper.h" +#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/LowLevelTypeUtils.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/Support/Casting.h" + +#define DEBUG_TYPE "gi-combiner" + +using namespace llvm; + +bool CombinerHelper::matchUnmergeValuesAnyExtBuildVector(const MachineInstr &MI, + BuildFnTy &MatchInfo) { + const GUnmerge *Unmerge = cast<GUnmerge>(&MI); + + if (!MRI.hasOneNonDBGUse(Unmerge->getSourceReg())) + return false; + + const MachineInstr *Source = MRI.getVRegDef(Unmerge->getSourceReg()); + + LLT DstTy = MRI.getType(Unmerge->getReg(0)); + + // $bv:_(<8 x s8>) = G_BUILD_VECTOR .... + // $any:_(<8 x s16>) = G_ANYEXT $bv + // $uv:_(<4 x s16>), $uv1:_(<4 x s16>) = G_UNMERGE_VALUES $any + // + // -> + // + // $any:_(s16) = G_ANYEXT $bv[0] + // $any1:_(s16) = G_ANYEXT $bv[1] + // $any2:_(s16) = G_ANYEXT $bv[2] + // $any3:_(s16) = G_ANYEXT $bv[3] + // $any4:_(s16) = G_ANYEXT $bv[4] + // $any5:_(s16) = G_ANYEXT $bv[5] + // $any6:_(s16) = G_ANYEXT $bv[6] + // $any7:_(s16) = G_ANYEXT $bv[7] + // $uv:_(<4 x s16>) = G_BUILD_VECTOR $any, $any1, $any2, $any3 + // $uv1:_(<4 x s16>) = G_BUILD_VECTOR $any4, $any5, $any6, $any7 + + // We want to unmerge into vectors. + if (!DstTy.isFixedVector()) + return false; + + const GAnyExt *Any = dyn_cast<GAnyExt>(Source); + if (!Any) + return false; + + const MachineInstr *NextSource = MRI.getVRegDef(Any->getSrcReg()); + + if (const GBuildVector *BV = dyn_cast<GBuildVector>(NextSource)) { + // G_UNMERGE_VALUES G_ANYEXT G_BUILD_VECTOR + + if (!MRI.hasOneNonDBGUse(BV->getReg(0))) + return false; + + // FIXME: check element types? + if (BV->getNumSources() % Unmerge->getNumDefs() != 0) + return false; + + LLT BigBvTy = MRI.getType(BV->getReg(0)); + LLT SmallBvTy = DstTy; + LLT SmallBvElemenTy = SmallBvTy.getElementType(); + + if (!isLegalOrBeforeLegalizer( + {TargetOpcode::G_BUILD_VECTOR, {SmallBvTy, SmallBvElemenTy}})) + return false; + + // We check the legality of scalar anyext. + if (!isLegalOrBeforeLegalizer( - {TargetOpcode::G_ANYEXT, + {SmallBvElemenTy, BigBvTy.getElementType()}})) + return false; + + MatchInfo = [=](MachineIRBuilder &B) { + // Build into each G_UNMERGE_VALUES def + // a small build vector with anyext from the source build vector.
+ for (unsigned I = 0; I < Unmerge->getNumDefs(); ++I) { + SmallVector<Register> Ops; + for (unsigned J = 0; J < SmallBvTy.getNumElements(); ++J) { + Register SourceArray = + BV->getSourceReg(I * SmallBvTy.getNumElements() + J); + auto AnyExt = B.buildAnyExt(SmallBvElemenTy, SourceArray); + Ops.push_back(AnyExt.getReg(0)); + } + B.buildBuildVector(Unmerge->getOperand(I).getReg(), Ops); + }; + }; + return true; + }; + + return false; +} + +bool CombinerHelper::matchUnmergeValuesOfScalarAndVector(const MachineInstr &MI, + BuildFnTy &MatchInfo) { + + constexpr unsigned MAX_NUM_DEFS_LIMIT = 4; + + // %opaque:_(<2 x s64>) = G_OPAQUE + // %un1:_(s64), %un2:_(s64) = G_UNMERGE_VALUES %opaque(<2 x s64>) + // + // -> + // + // %zero:_(s64) = G_CONSTANT i64 0 + // %one:_(s64) = G_CONSTANT i64 1 + // %un1:_(s64) = G_EXTRACT_VECTOR_ELT %opaque, %zero + // %un2:_(s64) = G_EXTRACT_VECTOR_ELT %opaque, %one + + const GUnmerge *Unmerge = cast<GUnmerge>(&MI); + + if (Unmerge->getNumDefs() > MAX_NUM_DEFS_LIMIT) + return false; + + LLT DstTy = MRI.getType(Unmerge->getReg(0)); + LLT SrcTy = MRI.getType(Unmerge->getSourceReg()); + + // We want to unmerge a vector into scalars. + if (!DstTy.isScalar() || !SrcTy.isFixedVector() || DstTy.getSizeInBits() > 64) + return false; + + if (DstTy != SrcTy.getElementType()) + return false; + + // We want to unmerge from an opaque vector. + const MachineInstr *Source = MRI.getVRegDef(Unmerge->getSourceReg()); + if (isa(Source)) + return false; + + unsigned PreferredVecIdxWidth = + getTargetLowering().getVectorIdxTy(getDataLayout()).getSizeInBits(); + + LLT IdxTy = LLT::scalar(PreferredVecIdxWidth); + + if (!isLegalOrBeforeLegalizer( + {TargetOpcode::G_EXTRACT_VECTOR_ELT, {DstTy, SrcTy, IdxTy}})) + return false; + + if (!isConstantLegalOrBeforeLegalizer(IdxTy)) + return false; + + MatchInfo = [=](MachineIRBuilder &B) { + for (unsigned I = 0; I < Unmerge->getNumDefs(); ++I) { + auto Index = B.buildConstant(IdxTy, I); + B.buildExtractVectorElement(Unmerge->getOperand(I).getReg(), + Unmerge->getSourceReg(), Index); + } + }; + + return true; +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir index 7566d38e6c6cf..e401cebd93a92 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir @@ -422,9 +422,12 @@ body: | ; CHECK-LABEL: name: test_dont_combine_unmerge_zext_to_zext_src_vector ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $w0 ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<2 x s32>) = G_ZEXT [[COPY]](<2 x s16>) - ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ZEXT]](<2 x s32>) - ; CHECK-NEXT: $w0 = COPY [[UV]](s32) - ; CHECK-NEXT: $w1 = COPY [[UV1]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[ZEXT]](<2 x s32>), [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[ZEXT]](<2 x s32>), [[C1]](s64) + ; CHECK-NEXT: $w0 = COPY [[EVEC]](s32) + ; CHECK-NEXT: $w1 = COPY [[EVEC1]](s32) %0:_(<2 x s16>) = COPY $w0 %3:_(<2 x s32>) = G_ZEXT %0(<2 x s16>) %1:_(s32),%2:_(s32) = G_UNMERGE_VALUES %3(<2 x s32>) @@ -539,3 +542,83 @@ body: | $q0 = COPY %un1(s128) $q1 = COPY %un2(s128) ...
+ +# Check that we unmerge the opaque vector into extract vector elt +--- +name: test_opaque_vector_scalar +body: | + bb.1: + ; CHECK-LABEL: name: test_opaque_vector_scalar + ; CHECK: %opaque:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %un1:_(s64) = G_EXTRACT_VECTOR_ELT %opaque(<2 x s64>), [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: %un2:_(s64) = G_EXTRACT_VECTOR_ELT %opaque(<2 x s64>), [[C1]](s64) + ; CHECK-NEXT: $x0 = COPY %un1(s64) + ; CHECK-NEXT: $x1 = COPY %un2(s64) + %opaque:_(<2 x s64>) = COPY $q0 + %un1:_(s64), %un2:_(s64) = G_UNMERGE_VALUES %opaque(<2 x s64>) + $x0 = COPY %un1(s64) + $x1 = COPY %un2(s64) +... + +# Check that we don't unmerge the opaque vector into scalars +--- +name: test_opaque_vector_vector +body: | + bb.1: + ; CHECK-LABEL: name: test_opaque_vector_vector + ; CHECK: %opaque:_(s128) = COPY $q0 + ; CHECK-NEXT: %un1:_(s64), %un2:_(s64) = G_UNMERGE_VALUES %opaque(s128) + ; CHECK-NEXT: $x0 = COPY %un1(s64) + ; CHECK-NEXT: $x1 = COPY %un2(s64) + %opaque:_(s128) = COPY $q0 + %un1:_(s64), %un2:_(s64) = G_UNMERGE_VALUES %opaque(s128) + $x0 = COPY %un1(s64) + $x1 = COPY %un2(s64) +... + +# Check that we unmerge the long opaque vector into extract vector elt +--- +name: test_long_opaque_vector_scalar +body: | + bb.1: + ; CHECK-LABEL: name: test_long_opaque_vector_scalar + ; CHECK: %opaque:_(<8 x s16>) = COPY $q0 + ; CHECK-NEXT: %un1:_(s16), %un2:_(s16), %un3:_(s16), %un4:_(s16), %un5:_(s16), %un6:_(s16), %un7:_(s16), %un8:_(s16) = G_UNMERGE_VALUES %opaque(<8 x s16>) + ; CHECK-NEXT: %zext1:_(s32) = G_ZEXT %un1(s16) + ; CHECK-NEXT: %zext2:_(s32) = G_ZEXT %un2(s16) + ; CHECK-NEXT: %zext3:_(s32) = G_ZEXT %un3(s16) + ; CHECK-NEXT: %zext4:_(s32) = G_ZEXT %un4(s16) + ; CHECK-NEXT: %zext5:_(s32) = G_ZEXT %un5(s16) + ; CHECK-NEXT: %zext6:_(s32) = G_ZEXT %un6(s16) + ; CHECK-NEXT: %zext7:_(s32) = G_ZEXT %un7(s16) + ; CHECK-NEXT: %zext8:_(s32) = G_ZEXT %un8(s16) + ; CHECK-NEXT: $w0 = COPY %zext1(s32) + ; CHECK-NEXT: $w1 = COPY %zext2(s32) + ; CHECK-NEXT: $w0 = COPY %zext3(s32) + ; CHECK-NEXT: $w1 = COPY %zext4(s32) + ; CHECK-NEXT: $w0 = COPY %zext5(s32) + ; CHECK-NEXT: $w1 = COPY %zext6(s32) + ; CHECK-NEXT: $w0 = COPY %zext7(s32) + ; CHECK-NEXT: $w1 = COPY %zext8(s32) + %opaque:_(<8 x s16>) = COPY $q0 + %un1:_(s16), %un2:_(s16), %un3:_(s16), %un4:_(s16), %un5:_(s16), %un6:_(s16), %un7:_(s16), %un8:_(s16) = G_UNMERGE_VALUES %opaque(<8 x s16>) + %zext1:_(s32) = G_ZEXT %un1 + %zext2:_(s32) = G_ZEXT %un2 + %zext3:_(s32) = G_ZEXT %un3 + %zext4:_(s32) = G_ZEXT %un4 + %zext5:_(s32) = G_ZEXT %un5 + %zext6:_(s32) = G_ZEXT %un6 + %zext7:_(s32) = G_ZEXT %un7 + %zext8:_(s32) = G_ZEXT %un8 + $w0 = COPY %zext1(s32) + $w1 = COPY %zext2(s32) + $w0 = COPY %zext3(s32) + $w1 = COPY %zext4(s32) + $w0 = COPY %zext5(s32) + $w1 = COPY %zext6(s32) + $w0 = COPY %zext7(s32) + $w1 = COPY %zext8(s32) +... 
+ diff --git a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll index f7aa57a068a4c..4d75367fa06b4 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll @@ -590,14 +590,26 @@ entry: } define i16 @sminv_v3i16(<3 x i16> %a) { -; CHECK-LABEL: sminv_v3i16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov w8, #32767 // =0x7fff -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: sminv h0, v0.4h -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sminv_v3i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: mov w8, #32767 // =0x7fff +; CHECK-SD-NEXT: mov v0.h[3], w8 +; CHECK-SD-NEXT: sminv h0, v0.4h +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sminv_v3i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov w8, #32767 // =0x7fff +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[3], w8 +; CHECK-GI-NEXT: sminv h0, v1.4h +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret entry: %arg1 = call i16 @llvm.vector.reduce.smin.v3i16(<3 x i16> %a) ret i16 %arg1 @@ -649,13 +661,24 @@ entry: } define i32 @sminv_v3i32(<3 x i32> %a) { -; CHECK-LABEL: sminv_v3i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #2147483647 // =0x7fffffff -; CHECK-NEXT: mov v0.s[3], w8 -; CHECK-NEXT: sminv s0, v0.4s -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sminv_v3i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #2147483647 // =0x7fffffff +; CHECK-SD-NEXT: mov v0.s[3], w8 +; CHECK-SD-NEXT: sminv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sminv_v3i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov w8, #2147483647 // =0x7fffffff +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v1.s[3], w8 +; CHECK-GI-NEXT: sminv s0, v1.4s +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret entry: %arg1 = call i32 @llvm.vector.reduce.smin.v3i32(<3 x i32> %a) ret i32 %arg1 @@ -954,9 +977,12 @@ define i16 @smaxv_v3i16(<3 x i16> %a) { ; CHECK-GI-LABEL: smaxv_v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] ; CHECK-GI-NEXT: mov w8, #32768 // =0x8000 -; CHECK-GI-NEXT: mov v0.h[3], w8 -; CHECK-GI-NEXT: smaxv h0, v0.4h +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[3], w8 +; CHECK-GI-NEXT: smaxv h0, v1.4h ; CHECK-GI-NEXT: fmov w0, s0 ; CHECK-GI-NEXT: ret entry: @@ -1010,13 +1036,24 @@ entry: } define i32 @smaxv_v3i32(<3 x i32> %a) { -; CHECK-LABEL: smaxv_v3i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #-2147483648 // =0x80000000 -; CHECK-NEXT: mov v0.s[3], w8 -; CHECK-NEXT: smaxv s0, v0.4s -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: smaxv_v3i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #-2147483648 // =0x80000000 +; CHECK-SD-NEXT: mov v0.s[3], w8 +; CHECK-SD-NEXT: smaxv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: smaxv_v3i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov w8, #-2147483648 // =0x80000000 +; 
CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v1.s[3], w8 +; CHECK-GI-NEXT: smaxv s0, v1.4s +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret entry: %arg1 = call i32 @llvm.vector.reduce.smax.v3i32(<3 x i32> %a) ret i32 %arg1 @@ -1313,9 +1350,12 @@ define i16 @uminv_v3i16(<3 x i16> %a) { ; CHECK-GI-LABEL: uminv_v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] ; CHECK-GI-NEXT: mov w8, #65535 // =0xffff -; CHECK-GI-NEXT: mov v0.h[3], w8 -; CHECK-GI-NEXT: uminv h0, v0.4h +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[3], w8 +; CHECK-GI-NEXT: uminv h0, v1.4h ; CHECK-GI-NEXT: fmov w0, s0 ; CHECK-GI-NEXT: ret entry: @@ -1369,13 +1409,24 @@ entry: } define i32 @uminv_v3i32(<3 x i32> %a) { -; CHECK-LABEL: uminv_v3i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #-1 // =0xffffffff -; CHECK-NEXT: mov v0.s[3], w8 -; CHECK-NEXT: uminv s0, v0.4s -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: uminv_v3i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #-1 // =0xffffffff +; CHECK-SD-NEXT: mov v0.s[3], w8 +; CHECK-SD-NEXT: uminv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uminv_v3i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v1.s[3], w8 +; CHECK-GI-NEXT: uminv s0, v1.4s +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret entry: %arg1 = call i32 @llvm.vector.reduce.umin.v3i32(<3 x i32> %a) ret i32 %arg1 @@ -1671,9 +1722,12 @@ define i16 @umaxv_v3i16(<3 x i16> %a) { ; CHECK-GI-LABEL: umaxv_v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] ; CHECK-GI-NEXT: mov w8, #0 // =0x0 -; CHECK-GI-NEXT: mov v0.h[3], w8 -; CHECK-GI-NEXT: umaxv h0, v0.4h +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[3], w8 +; CHECK-GI-NEXT: umaxv h0, v1.4h ; CHECK-GI-NEXT: fmov w0, s0 ; CHECK-GI-NEXT: ret entry: @@ -1727,12 +1781,22 @@ entry: } define i32 @umaxv_v3i32(<3 x i32> %a) { -; CHECK-LABEL: umaxv_v3i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov v0.s[3], wzr -; CHECK-NEXT: umaxv s0, v0.4s -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: umaxv_v3i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov v0.s[3], wzr +; CHECK-SD-NEXT: umaxv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: umaxv_v3i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v1.s[3], wzr +; CHECK-GI-NEXT: umaxv s0, v1.4s +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret entry: %arg1 = call i32 @llvm.vector.reduce.umax.v3i32(<3 x i32> %a) ret i32 %arg1 diff --git a/llvm/test/CodeGen/AArch64/abs.ll b/llvm/test/CodeGen/AArch64/abs.ll index 25a14ef9a49ee..29fe2d02a93e1 100644 --- a/llvm/test/CodeGen/AArch64/abs.ll +++ b/llvm/test/CodeGen/AArch64/abs.ll @@ -336,9 +336,17 @@ define <3 x i8> @abs_v3i8(<3 x i8> %a){ ; CHECK-GI-NEXT: mov v0.b[1], w1 ; CHECK-GI-NEXT: mov v0.b[2], w2 ; CHECK-GI-NEXT: abs v0.8b, v0.8b -; CHECK-GI-NEXT: umov w0, v0.b[0] -; CHECK-GI-NEXT: umov w1, v0.b[1] -; CHECK-GI-NEXT: umov w2, 
v0.b[2] +; CHECK-GI-NEXT: umov w8, v0.b[0] +; CHECK-GI-NEXT: umov w9, v0.b[1] +; CHECK-GI-NEXT: mov v1.s[0], w8 +; CHECK-GI-NEXT: umov w8, v0.b[2] +; CHECK-GI-NEXT: mov v1.s[1], w9 +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: mov s0, v1.s[1] +; CHECK-GI-NEXT: mov s2, v1.s[2] +; CHECK-GI-NEXT: fmov w0, s1 +; CHECK-GI-NEXT: fmov w1, s0 +; CHECK-GI-NEXT: fmov w2, s2 ; CHECK-GI-NEXT: ret entry: %res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %a, i1 0) @@ -358,10 +366,30 @@ entry: declare <7 x i8> @llvm.abs.v7i8(<7 x i8>, i1) define <3 x i16> @abs_v3i16(<3 x i16> %a){ -; CHECK-LABEL: abs_v3i16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: abs v0.4h, v0.4h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: abs_v3i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: abs v0.4h, v0.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: abs_v3i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: abs v1.4h, v1.4h +; CHECK-GI-NEXT: umov w8, v1.h[0] +; CHECK-GI-NEXT: umov w9, v1.h[1] +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: umov w8, v1.h[2] +; CHECK-GI-NEXT: mov v0.s[1], w9 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret entry: %res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %a, i1 0) ret <3 x i16> %res @@ -380,10 +408,21 @@ entry: declare <7 x i16> @llvm.abs.v7i16(<7 x i16>, i1) define <3 x i32> @abs_v3i32(<3 x i32> %a){ -; CHECK-LABEL: abs_v3i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: abs v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: abs_v3i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: abs v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: abs_v3i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: abs v1.4s, v1.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %res = call <3 x i32> @llvm.abs.v3i32(<3 x i32> %a, i1 0) ret <3 x i32> %res diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll index e3072dc41d933..5d11deaac40be 100644 --- a/llvm/test/CodeGen/AArch64/add.ll +++ b/llvm/test/CodeGen/AArch64/add.ll @@ -343,10 +343,24 @@ entry: } define <3 x i32> @v3i32(<3 x i32> %d, <3 x i32> %e) { -; CHECK-LABEL: v3i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: v3i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v3i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v3.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v3.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v3.s[2], v1.s[2] +; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %s = add <3 x i32> %d, %e ret <3 x i32> %s @@ -408,8 +422,9 @@ define <3 x i64> @v3i64(<3 x i64> %d, <3 x i64> %e) { ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: mov 
v3.d[1], v4.d[0] ; CHECK-GI-NEXT: add x8, x8, x9 -; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: mov v2.d[0], x8 ; CHECK-GI-NEXT: add v0.2d, v0.2d, v3.2d +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll index 5c7429aebb31e..70477b0c98c77 100644 --- a/llvm/test/CodeGen/AArch64/andorxor.ll +++ b/llvm/test/CodeGen/AArch64/andorxor.ll @@ -1050,30 +1050,72 @@ entry: } define <3 x i32> @and_v3i32(<3 x i32> %d, <3 x i32> %e) { -; CHECK-LABEL: and_v3i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: and_v3i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_v3i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v3.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v3.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v3.s[2], v1.s[2] +; CHECK-GI-NEXT: and v1.16b, v2.16b, v3.16b +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %s = and <3 x i32> %d, %e ret <3 x i32> %s } define <3 x i32> @or_v3i32(<3 x i32> %d, <3 x i32> %e) { -; CHECK-LABEL: or_v3i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: or_v3i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: or_v3i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v3.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v3.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v3.s[2], v1.s[2] +; CHECK-GI-NEXT: orr v1.16b, v2.16b, v3.16b +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %s = or <3 x i32> %d, %e ret <3 x i32> %s } define <3 x i32> @xor_v3i32(<3 x i32> %d, <3 x i32> %e) { -; CHECK-LABEL: xor_v3i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: xor_v3i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: xor_v3i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v3.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v3.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v3.s[2], v1.s[2] +; CHECK-GI-NEXT: eor v1.16b, v2.16b, v3.16b +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %s = xor <3 x i32> %d, %e ret <3 x i32> %s @@ -1209,8 +1251,9 @@ define <3 x i64> @and_v3i64(<3 x i64> %d, <3 x i64> %e) { ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] ; CHECK-GI-NEXT: and x8, x8, x9 -; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: mov v2.d[0], x8 ; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: 
ret @@ -1238,8 +1281,9 @@ define <3 x i64> @or_v3i64(<3 x i64> %d, <3 x i64> %e) { ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] ; CHECK-GI-NEXT: orr x8, x8, x9 -; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: mov v2.d[0], x8 ; CHECK-GI-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret @@ -1267,8 +1311,9 @@ define <3 x i64> @xor_v3i64(<3 x i64> %d, <3 x i64> %e) { ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] ; CHECK-GI-NEXT: eor x8, x8, x9 -; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: mov v2.d[0], x8 ; CHECK-GI-NEXT: eor v0.16b, v0.16b, v3.16b +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-fp128.ll b/llvm/test/CodeGen/AArch64/arm64-fp128.ll index 7eb26096ed156..8ec8ba877d854 100644 --- a/llvm/test/CodeGen/AArch64/arm64-fp128.ll +++ b/llvm/test/CodeGen/AArch64/arm64-fp128.ll @@ -1216,9 +1216,12 @@ define <2 x half> @vec_round_f16(<2 x fp128> %val) { ; CHECK-GI-NEXT: bl __trunctfhf2 ; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: bl __trunctfhf2 -; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[0] +; CHECK-GI-NEXT: mov h0, v1.h[1] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[0] +; CHECK-GI-NEXT: mov v0.16b, v1.16b ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll index 39f2572d9fd35..e34bac2e2fa69 100644 --- a/llvm/test/CodeGen/AArch64/bitcast.ll +++ b/llvm/test/CodeGen/AArch64/bitcast.ll @@ -634,10 +634,21 @@ define <8 x i64> @bitcast_v16i32_v8i64(<16 x i32> %a, <16 x i32> %b){ ; ===== Vectors with Non-Pow 2 Widths ===== define <6 x i16> @bitcast_v3i32_v6i16(<3 x i32> %a, <3 x i32> %b){ -; CHECK-LABEL: bitcast_v3i32_v6i16: -; CHECK: // %bb.0: -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: bitcast_v3i32_v6i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: bitcast_v3i32_v6i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v3.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v3.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v3.s[2], v1.s[2] +; CHECK-GI-NEXT: add v0.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: ret %c = add <3 x i32> %a, %b %d = bitcast <3 x i32> %c to <6 x i16> ret <6 x i16> %d diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll index 74e4a167ae14c..fd1ac47bef7d1 100644 --- a/llvm/test/CodeGen/AArch64/bswap.ll +++ b/llvm/test/CodeGen/AArch64/bswap.ll @@ -246,10 +246,30 @@ declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) ; ===== Vectors with Non-Pow 2 Widths ===== define <3 x i16> @bswap_v3i16(<3 x i16> %a){ -; CHECK-LABEL: bswap_v3i16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: rev16 v0.8b, v0.8b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: bswap_v3i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: rev16 v0.8b, v0.8b +; CHECK-SD-NEXT: ret 
+; +; CHECK-GI-LABEL: bswap_v3i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: rev16 v1.8b, v1.8b +; CHECK-GI-NEXT: umov w8, v1.h[0] +; CHECK-GI-NEXT: umov w9, v1.h[1] +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: umov w8, v1.h[2] +; CHECK-GI-NEXT: mov v0.s[1], w9 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret entry: %res = call <3 x i16> @llvm.bswap.v3i16(<3 x i16> %a) ret <3 x i16> %res @@ -268,10 +288,21 @@ entry: declare <7 x i16> @llvm.bswap.v7i16(<7 x i16>) define <3 x i32> @bswap_v3i32(<3 x i32> %a){ -; CHECK-LABEL: bswap_v3i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: rev32 v0.16b, v0.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: bswap_v3i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: rev32 v0.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: bswap_v3i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: rev32 v1.16b, v1.16b +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %res = call <3 x i32> @llvm.bswap.v3i32(<3 x i32> %a) ret <3 x i32> %res diff --git a/llvm/test/CodeGen/AArch64/fabs.ll b/llvm/test/CodeGen/AArch64/fabs.ll index 43e9007073634..0e1f9fba307ad 100644 --- a/llvm/test/CodeGen/AArch64/fabs.ll +++ b/llvm/test/CodeGen/AArch64/fabs.ll @@ -88,6 +88,7 @@ define <3 x double> @fabs_v3f64(<3 x double> %a) { ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: fabs d2, d2 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: fabs v0.2d, v0.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -119,10 +120,21 @@ entry: } define <3 x float> @fabs_v3f32(<3 x float> %a) { -; CHECK-LABEL: fabs_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fabs v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fabs_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fabs v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fabs_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: fabs v1.4s, v1.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = call <3 x float> @llvm.fabs.v3f32(<3 x float> %a) ret <3 x float> %c diff --git a/llvm/test/CodeGen/AArch64/faddsub.ll b/llvm/test/CodeGen/AArch64/faddsub.ll index b15579199a059..de9a458a98b60 100644 --- a/llvm/test/CodeGen/AArch64/faddsub.ll +++ b/llvm/test/CodeGen/AArch64/faddsub.ll @@ -93,6 +93,7 @@ define <3 x double> @fadd_v3f64(<3 x double> %a, <3 x double> %b) { ; CHECK-GI-NEXT: fadd d2, d2, d5 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: fadd v0.2d, v0.2d, v3.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -130,10 +131,24 @@ entry: } define <3 x 
float> @fadd_v3f32(<3 x float> %a, <3 x float> %b) { -; CHECK-LABEL: fadd_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fadd_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fadd_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v3.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v3.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v3.s[2], v1.s[2] +; CHECK-GI-NEXT: fadd v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = fadd <3 x float> %a, %b ret <3 x float> %c @@ -434,6 +449,7 @@ define <3 x double> @fsub_v3f64(<3 x double> %a, <3 x double> %b) { ; CHECK-GI-NEXT: fsub d2, d2, d5 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: fsub v0.2d, v0.2d, v3.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -471,10 +487,24 @@ entry: } define <3 x float> @fsub_v3f32(<3 x float> %a, <3 x float> %b) { -; CHECK-LABEL: fsub_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fsub v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fsub_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fsub v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fsub_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v3.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v3.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v3.s[2], v1.s[2] +; CHECK-GI-NEXT: fsub v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = fsub <3 x float> %a, %b ret <3 x float> %c diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll index 66f26fc9d8597..c1459ac5b5643 100644 --- a/llvm/test/CodeGen/AArch64/fcmp.ll +++ b/llvm/test/CodeGen/AArch64/fcmp.ll @@ -783,7 +783,8 @@ define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double> ; CHECK-GI-NEXT: ldp x20, x19, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: orr x8, x9, x8 ; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: mov v2.d[0], x8 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: add sp, sp, #176 @@ -856,8 +857,9 @@ define <3 x double> @v3f64_double(<3 x double> %a, <3 x double> %b, <3 x double> ; CHECK-GI-NEXT: and x8, x8, x9 ; CHECK-GI-NEXT: bic x9, x10, x9 ; CHECK-GI-NEXT: orr x8, x8, x9 -; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: mov v2.d[0], x8 ; CHECK-GI-NEXT: bsl v0.16b, v6.16b, v1.16b +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret @@ -930,24 +932,33 @@ define <3 x i32> @v3f64_i32(<3 x double> %a, <3 x double> %b, <3 x i32> %d, <3 x ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] ; CHECK-GI-NEXT: mov v1.s[0], w8 +; CHECK-GI-NEXT: mov v4.s[0], v7.s[0] ; CHECK-GI-NEXT: cset w9, mi ; 
CHECK-GI-NEXT: mov v2.s[0], w9 ; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff ; CHECK-GI-NEXT: fcmgt v0.2d, v3.2d, v0.2d ; CHECK-GI-NEXT: mov v1.s[1], w8 ; CHECK-GI-NEXT: mov v3.s[0], w9 +; CHECK-GI-NEXT: mov v4.s[1], v7.s[1] ; CHECK-GI-NEXT: xtn v0.2s, v0.2d ; CHECK-GI-NEXT: mov v1.s[2], w8 ; CHECK-GI-NEXT: mov v3.s[1], w9 +; CHECK-GI-NEXT: mov v4.s[2], v7.s[2] ; CHECK-GI-NEXT: mov v0.d[1], v2.d[0] +; CHECK-GI-NEXT: mov v2.s[0], v6.s[0] ; CHECK-GI-NEXT: mov v3.s[2], w9 ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: neg v1.4s, v1.4s +; CHECK-GI-NEXT: mov v2.s[1], v6.s[1] ; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: mov v2.s[2], v6.s[2] ; CHECK-GI-NEXT: eor v1.16b, v0.16b, v3.16b -; CHECK-GI-NEXT: and v0.16b, v6.16b, v0.16b -; CHECK-GI-NEXT: and v1.16b, v7.16b, v1.16b -; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b +; CHECK-GI-NEXT: and v1.16b, v4.16b, v1.16b +; CHECK-GI-NEXT: orr v1.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: ret entry: %c = fcmp olt <3 x double> %a, %b @@ -1000,22 +1011,37 @@ define <3 x float> @v3f32_float(<3 x float> %a, <3 x float> %b, <3 x float> %d, ; ; CHECK-GI-LABEL: v3f32_float: ; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v4.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v5.s[0], v1.s[0] ; CHECK-GI-NEXT: mov w8, #31 // =0x1f +; CHECK-GI-NEXT: mov v6.s[0], w8 ; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff -; CHECK-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: mov v4.s[0], w8 -; CHECK-GI-NEXT: mov v5.s[0], w9 -; CHECK-GI-NEXT: mov v4.s[1], w8 -; CHECK-GI-NEXT: mov v5.s[1], w9 -; CHECK-GI-NEXT: mov v4.s[2], w8 -; CHECK-GI-NEXT: mov v5.s[2], w9 -; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s -; CHECK-GI-NEXT: neg v1.4s, v4.4s -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: eor v1.16b, v0.16b, v5.16b -; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b -; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b -; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: mov v4.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v5.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v6.s[1], w8 +; CHECK-GI-NEXT: mov v4.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v5.s[2], v1.s[2] +; CHECK-GI-NEXT: mov v0.s[0], w9 +; CHECK-GI-NEXT: mov v6.s[2], w8 +; CHECK-GI-NEXT: fcmgt v1.4s, v5.4s, v4.4s +; CHECK-GI-NEXT: mov v4.s[0], v2.s[0] +; CHECK-GI-NEXT: mov v5.s[0], v3.s[0] +; CHECK-GI-NEXT: mov v0.s[1], w9 +; CHECK-GI-NEXT: ushl v1.4s, v1.4s, v6.4s +; CHECK-GI-NEXT: neg v6.4s, v6.4s +; CHECK-GI-NEXT: mov v4.s[1], v2.s[1] +; CHECK-GI-NEXT: mov v5.s[1], v3.s[1] +; CHECK-GI-NEXT: mov v0.s[2], w9 +; CHECK-GI-NEXT: sshl v1.4s, v1.4s, v6.4s +; CHECK-GI-NEXT: mov v4.s[2], v2.s[2] +; CHECK-GI-NEXT: mov v5.s[2], v3.s[2] +; CHECK-GI-NEXT: eor v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: and v1.16b, v4.16b, v1.16b +; CHECK-GI-NEXT: and v0.16b, v5.16b, v0.16b +; CHECK-GI-NEXT: orr v1.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: ret entry: %c = fcmp olt <3 x float> %a, %b @@ -1078,22 +1104,37 @@ define <3 x i32> @v3f32_i32(<3 x float> %a, <3 x float> %b, <3 x i32> %d, <3 x i ; ; CHECK-GI-LABEL: v3f32_i32: ; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v4.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v5.s[0], v1.s[0] ; CHECK-GI-NEXT: mov w8, #31 // =0x1f +; CHECK-GI-NEXT: mov v6.s[0], w8 ; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff -; CHECK-GI-NEXT: fcmgt v0.4s, 
v1.4s, v0.4s -; CHECK-GI-NEXT: mov v4.s[0], w8 -; CHECK-GI-NEXT: mov v5.s[0], w9 -; CHECK-GI-NEXT: mov v4.s[1], w8 -; CHECK-GI-NEXT: mov v5.s[1], w9 -; CHECK-GI-NEXT: mov v4.s[2], w8 -; CHECK-GI-NEXT: mov v5.s[2], w9 -; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s -; CHECK-GI-NEXT: neg v1.4s, v4.4s -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: eor v1.16b, v0.16b, v5.16b -; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b -; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b -; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: mov v4.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v5.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v6.s[1], w8 +; CHECK-GI-NEXT: mov v4.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v5.s[2], v1.s[2] +; CHECK-GI-NEXT: mov v0.s[0], w9 +; CHECK-GI-NEXT: mov v6.s[2], w8 +; CHECK-GI-NEXT: fcmgt v1.4s, v5.4s, v4.4s +; CHECK-GI-NEXT: mov v4.s[0], v2.s[0] +; CHECK-GI-NEXT: mov v5.s[0], v3.s[0] +; CHECK-GI-NEXT: mov v0.s[1], w9 +; CHECK-GI-NEXT: ushl v1.4s, v1.4s, v6.4s +; CHECK-GI-NEXT: neg v6.4s, v6.4s +; CHECK-GI-NEXT: mov v4.s[1], v2.s[1] +; CHECK-GI-NEXT: mov v5.s[1], v3.s[1] +; CHECK-GI-NEXT: mov v0.s[2], w9 +; CHECK-GI-NEXT: sshl v1.4s, v1.4s, v6.4s +; CHECK-GI-NEXT: mov v4.s[2], v2.s[2] +; CHECK-GI-NEXT: mov v5.s[2], v3.s[2] +; CHECK-GI-NEXT: eor v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: and v1.16b, v4.16b, v1.16b +; CHECK-GI-NEXT: and v0.16b, v5.16b, v0.16b +; CHECK-GI-NEXT: orr v1.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: ret entry: %c = fcmp olt <3 x float> %a, %b diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll index a42ec8e253be2..6eb2d958540be 100644 --- a/llvm/test/CodeGen/AArch64/fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/fcopysign.ll @@ -111,7 +111,8 @@ define <3 x double> @copysign_v3f64(<3 x double> %a, <3 x double> %b) { ; CHECK-GI-NEXT: and x9, x9, #0x8000000000000000 ; CHECK-GI-NEXT: fneg v1.2d, v6.2d ; CHECK-GI-NEXT: orr x8, x8, x9 -; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: mov v2.d[0], x8 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: bif v0.16b, v3.16b, v1.16b ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -156,15 +157,24 @@ define <3 x float> @copysign_v3f32(<3 x float> %a, <3 x float> %b) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov w8, #-2147483648 // =0x80000000 ; CHECK-GI-NEXT: mov w9, #2147483647 // =0x7fffffff -; CHECK-GI-NEXT: mov v2.s[0], w9 -; CHECK-GI-NEXT: mov v3.s[0], w8 -; CHECK-GI-NEXT: mov v2.s[1], w9 -; CHECK-GI-NEXT: mov v3.s[1], w8 -; CHECK-GI-NEXT: mov v2.s[2], w9 -; CHECK-GI-NEXT: mov v3.s[2], w8 -; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b -; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v3.s[0], w9 +; CHECK-GI-NEXT: mov v4.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v5.s[0], w8 +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v3.s[1], w9 +; CHECK-GI-NEXT: mov v4.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v5.s[1], w8 +; CHECK-GI-NEXT: mov v2.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v4.s[2], v1.s[2] +; CHECK-GI-NEXT: mov v3.s[2], w9 +; CHECK-GI-NEXT: mov v5.s[2], w8 +; CHECK-GI-NEXT: and v0.16b, v2.16b, v3.16b +; CHECK-GI-NEXT: and v1.16b, v4.16b, v5.16b +; CHECK-GI-NEXT: orr v1.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; 
CHECK-GI-NEXT: ret entry: %c = call <3 x float> @llvm.copysign.v3f32(<3 x float> %a, <3 x float> %b) diff --git a/llvm/test/CodeGen/AArch64/fcvt.ll b/llvm/test/CodeGen/AArch64/fcvt.ll index b408e9c1bd4e6..15a8f0557cc41 100644 --- a/llvm/test/CodeGen/AArch64/fcvt.ll +++ b/llvm/test/CodeGen/AArch64/fcvt.ll @@ -84,6 +84,7 @@ define <3 x double> @ceil_v3f64(<3 x double> %a) { ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: frintp d2, d2 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: frintp v0.2d, v0.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -115,10 +116,21 @@ entry: } define <3 x float> @ceil_v3f32(<3 x float> %a) { -; CHECK-LABEL: ceil_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: frintp v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ceil_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: frintp v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ceil_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: frintp v1.4s, v1.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = call <3 x float> @llvm.ceil.v3f32(<3 x float> %a) ret <3 x float> %c @@ -383,6 +395,7 @@ define <3 x double> @floor_v3f64(<3 x double> %a) { ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: frintm d2, d2 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: frintm v0.2d, v0.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -414,10 +427,21 @@ entry: } define <3 x float> @floor_v3f32(<3 x float> %a) { -; CHECK-LABEL: floor_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: frintm v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: floor_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: frintm v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: floor_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: frintm v1.4s, v1.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = call <3 x float> @llvm.floor.v3f32(<3 x float> %a) ret <3 x float> %c @@ -682,6 +706,7 @@ define <3 x double> @nearbyint_v3f64(<3 x double> %a) { ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: frinti d2, d2 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: frinti v0.2d, v0.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -713,10 +738,21 @@ entry: } define <3 x float> @nearbyint_v3f32(<3 x float> %a) { -; CHECK-LABEL: nearbyint_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: frinti v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: nearbyint_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: frinti v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: nearbyint_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: frinti v1.4s, v1.4s +; CHECK-GI-NEXT: mov v0.s[0], 
v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = call <3 x float> @llvm.nearbyint.v3f32(<3 x float> %a) ret <3 x float> %c @@ -981,6 +1017,7 @@ define <3 x double> @roundeven_v3f64(<3 x double> %a) { ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: frintn d2, d2 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: frintn v0.2d, v0.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -1012,10 +1049,21 @@ entry: } define <3 x float> @roundeven_v3f32(<3 x float> %a) { -; CHECK-LABEL: roundeven_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: frintn v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: roundeven_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: frintn v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: roundeven_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: frintn v1.4s, v1.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = call <3 x float> @llvm.roundeven.v3f32(<3 x float> %a) ret <3 x float> %c @@ -1280,6 +1328,7 @@ define <3 x double> @rint_v3f64(<3 x double> %a) { ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: frintx d2, d2 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: frintx v0.2d, v0.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -1311,10 +1360,21 @@ entry: } define <3 x float> @rint_v3f32(<3 x float> %a) { -; CHECK-LABEL: rint_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: frintx v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: rint_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: frintx v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: rint_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: frintx v1.4s, v1.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = call <3 x float> @llvm.rint.v3f32(<3 x float> %a) ret <3 x float> %c @@ -1579,6 +1639,7 @@ define <3 x double> @round_v3f64(<3 x double> %a) { ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: frinta d2, d2 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: frinta v0.2d, v0.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -1610,10 +1671,21 @@ entry: } define <3 x float> @round_v3f32(<3 x float> %a) { -; CHECK-LABEL: round_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: frinta v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: round_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: frinta v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: round_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: frinta v1.4s, v1.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; 
CHECK-GI-NEXT: ret entry: %c = call <3 x float> @llvm.round.v3f32(<3 x float> %a) ret <3 x float> %c @@ -1878,6 +1950,7 @@ define <3 x double> @trunc_v3f64(<3 x double> %a) { ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: frintz d2, d2 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: frintz v0.2d, v0.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -1909,10 +1982,21 @@ entry: } define <3 x float> @trunc_v3f32(<3 x float> %a) { -; CHECK-LABEL: trunc_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: frintz v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: trunc_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: frintz v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: trunc_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: frintz v1.4s, v1.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = call <3 x float> @llvm.trunc.v3f32(<3 x float> %a) ret <3 x float> %c diff --git a/llvm/test/CodeGen/AArch64/fdiv.ll b/llvm/test/CodeGen/AArch64/fdiv.ll index 5bdccccc62b99..82ce3af7e614f 100644 --- a/llvm/test/CodeGen/AArch64/fdiv.ll +++ b/llvm/test/CodeGen/AArch64/fdiv.ll @@ -93,6 +93,7 @@ define <3 x double> @fdiv_v3f64(<3 x double> %a, <3 x double> %b) { ; CHECK-GI-NEXT: fdiv d2, d2, d5 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: fdiv v0.2d, v0.2d, v3.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -130,10 +131,24 @@ entry: } define <3 x float> @fdiv_v3f32(<3 x float> %a, <3 x float> %b) { -; CHECK-LABEL: fdiv_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fdiv v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fdiv_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fdiv v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fdiv_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v3.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v3.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v3.s[2], v1.s[2] +; CHECK-GI-NEXT: fdiv v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = fdiv <3 x float> %a, %b ret <3 x float> %c diff --git a/llvm/test/CodeGen/AArch64/fexplog.ll b/llvm/test/CodeGen/AArch64/fexplog.ll index f13e2fcd1c448..08068ac4f1088 100644 --- a/llvm/test/CodeGen/AArch64/fexplog.ll +++ b/llvm/test/CodeGen/AArch64/fexplog.ll @@ -139,29 +139,33 @@ define <3 x double> @exp_v3f64(<3 x double> %a) { ; ; CHECK-GI-LABEL: exp_v3f64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str d10, [sp, #-32]! 
// 8-byte Folded Spill -; CHECK-GI-NEXT: stp d9, d8, [sp, #8] // 16-byte Folded Spill -; CHECK-GI-NEXT: str x30, [sp, #24] // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 -; CHECK-GI-NEXT: .cfi_offset w30, -8 -; CHECK-GI-NEXT: .cfi_offset b8, -16 -; CHECK-GI-NEXT: .cfi_offset b9, -24 -; CHECK-GI-NEXT: .cfi_offset b10, -32 +; CHECK-GI-NEXT: sub sp, sp, #64 +; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -24 +; CHECK-GI-NEXT: .cfi_offset b9, -32 ; CHECK-GI-NEXT: fmov d8, d1 ; CHECK-GI-NEXT: fmov d9, d2 ; CHECK-GI-NEXT: bl exp -; CHECK-GI-NEXT: fmov d10, d0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d8 ; CHECK-GI-NEXT: bl exp -; CHECK-GI-NEXT: fmov d8, d0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d9 ; CHECK-GI-NEXT: bl exp -; CHECK-GI-NEXT: fmov d1, d8 -; CHECK-GI-NEXT: ldp d9, d8, [sp, #8] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fmov d2, d0 -; CHECK-GI-NEXT: fmov d0, d10 -; CHECK-GI-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v3.d[1], v1.d[0] +; CHECK-GI-NEXT: mov d1, v3.d[1] +; CHECK-GI-NEXT: fmov d0, d3 +; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret entry: %c = call <3 x double> @llvm.exp.v3f64(<3 x double> %a) @@ -355,7 +359,9 @@ define <3 x float> @exp_v3f32(<3 x float> %a) { ; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] ; CHECK-GI-NEXT: mov v1.s[2], v0.s[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret entry: @@ -1442,29 +1448,33 @@ define <3 x double> @exp2_v3f64(<3 x double> %a) { ; ; CHECK-GI-LABEL: exp2_v3f64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str d10, [sp, #-32]! 
// 8-byte Folded Spill -; CHECK-GI-NEXT: stp d9, d8, [sp, #8] // 16-byte Folded Spill -; CHECK-GI-NEXT: str x30, [sp, #24] // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 -; CHECK-GI-NEXT: .cfi_offset w30, -8 -; CHECK-GI-NEXT: .cfi_offset b8, -16 -; CHECK-GI-NEXT: .cfi_offset b9, -24 -; CHECK-GI-NEXT: .cfi_offset b10, -32 +; CHECK-GI-NEXT: sub sp, sp, #64 +; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -24 +; CHECK-GI-NEXT: .cfi_offset b9, -32 ; CHECK-GI-NEXT: fmov d8, d1 ; CHECK-GI-NEXT: fmov d9, d2 ; CHECK-GI-NEXT: bl exp2 -; CHECK-GI-NEXT: fmov d10, d0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d8 ; CHECK-GI-NEXT: bl exp2 -; CHECK-GI-NEXT: fmov d8, d0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d9 ; CHECK-GI-NEXT: bl exp2 -; CHECK-GI-NEXT: fmov d1, d8 -; CHECK-GI-NEXT: ldp d9, d8, [sp, #8] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fmov d2, d0 -; CHECK-GI-NEXT: fmov d0, d10 -; CHECK-GI-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v3.d[1], v1.d[0] +; CHECK-GI-NEXT: mov d1, v3.d[1] +; CHECK-GI-NEXT: fmov d0, d3 +; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret entry: %c = call <3 x double> @llvm.exp2.v3f64(<3 x double> %a) @@ -1658,7 +1668,9 @@ define <3 x float> @exp2_v3f32(<3 x float> %a) { ; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] ; CHECK-GI-NEXT: mov v1.s[2], v0.s[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret entry: @@ -2745,29 +2757,33 @@ define <3 x double> @log_v3f64(<3 x double> %a) { ; ; CHECK-GI-LABEL: log_v3f64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str d10, [sp, #-32]! 
// 8-byte Folded Spill -; CHECK-GI-NEXT: stp d9, d8, [sp, #8] // 16-byte Folded Spill -; CHECK-GI-NEXT: str x30, [sp, #24] // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 -; CHECK-GI-NEXT: .cfi_offset w30, -8 -; CHECK-GI-NEXT: .cfi_offset b8, -16 -; CHECK-GI-NEXT: .cfi_offset b9, -24 -; CHECK-GI-NEXT: .cfi_offset b10, -32 +; CHECK-GI-NEXT: sub sp, sp, #64 +; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -24 +; CHECK-GI-NEXT: .cfi_offset b9, -32 ; CHECK-GI-NEXT: fmov d8, d1 ; CHECK-GI-NEXT: fmov d9, d2 ; CHECK-GI-NEXT: bl log -; CHECK-GI-NEXT: fmov d10, d0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d8 ; CHECK-GI-NEXT: bl log -; CHECK-GI-NEXT: fmov d8, d0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d9 ; CHECK-GI-NEXT: bl log -; CHECK-GI-NEXT: fmov d1, d8 -; CHECK-GI-NEXT: ldp d9, d8, [sp, #8] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fmov d2, d0 -; CHECK-GI-NEXT: fmov d0, d10 -; CHECK-GI-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v3.d[1], v1.d[0] +; CHECK-GI-NEXT: mov d1, v3.d[1] +; CHECK-GI-NEXT: fmov d0, d3 +; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret entry: %c = call <3 x double> @llvm.log.v3f64(<3 x double> %a) @@ -2961,7 +2977,9 @@ define <3 x float> @log_v3f32(<3 x float> %a) { ; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] ; CHECK-GI-NEXT: mov v1.s[2], v0.s[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret entry: @@ -4048,29 +4066,33 @@ define <3 x double> @log2_v3f64(<3 x double> %a) { ; ; CHECK-GI-LABEL: log2_v3f64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str d10, [sp, #-32]! 
// 8-byte Folded Spill -; CHECK-GI-NEXT: stp d9, d8, [sp, #8] // 16-byte Folded Spill -; CHECK-GI-NEXT: str x30, [sp, #24] // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 -; CHECK-GI-NEXT: .cfi_offset w30, -8 -; CHECK-GI-NEXT: .cfi_offset b8, -16 -; CHECK-GI-NEXT: .cfi_offset b9, -24 -; CHECK-GI-NEXT: .cfi_offset b10, -32 +; CHECK-GI-NEXT: sub sp, sp, #64 +; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -24 +; CHECK-GI-NEXT: .cfi_offset b9, -32 ; CHECK-GI-NEXT: fmov d8, d1 ; CHECK-GI-NEXT: fmov d9, d2 ; CHECK-GI-NEXT: bl log2 -; CHECK-GI-NEXT: fmov d10, d0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d8 ; CHECK-GI-NEXT: bl log2 -; CHECK-GI-NEXT: fmov d8, d0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d9 ; CHECK-GI-NEXT: bl log2 -; CHECK-GI-NEXT: fmov d1, d8 -; CHECK-GI-NEXT: ldp d9, d8, [sp, #8] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fmov d2, d0 -; CHECK-GI-NEXT: fmov d0, d10 -; CHECK-GI-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v3.d[1], v1.d[0] +; CHECK-GI-NEXT: mov d1, v3.d[1] +; CHECK-GI-NEXT: fmov d0, d3 +; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret entry: %c = call <3 x double> @llvm.log2.v3f64(<3 x double> %a) @@ -4264,7 +4286,9 @@ define <3 x float> @log2_v3f32(<3 x float> %a) { ; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] ; CHECK-GI-NEXT: mov v1.s[2], v0.s[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret entry: @@ -5351,29 +5375,33 @@ define <3 x double> @log10_v3f64(<3 x double> %a) { ; ; CHECK-GI-LABEL: log10_v3f64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str d10, [sp, #-32]! 
// 8-byte Folded Spill -; CHECK-GI-NEXT: stp d9, d8, [sp, #8] // 16-byte Folded Spill -; CHECK-GI-NEXT: str x30, [sp, #24] // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 -; CHECK-GI-NEXT: .cfi_offset w30, -8 -; CHECK-GI-NEXT: .cfi_offset b8, -16 -; CHECK-GI-NEXT: .cfi_offset b9, -24 -; CHECK-GI-NEXT: .cfi_offset b10, -32 +; CHECK-GI-NEXT: sub sp, sp, #64 +; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -24 +; CHECK-GI-NEXT: .cfi_offset b9, -32 ; CHECK-GI-NEXT: fmov d8, d1 ; CHECK-GI-NEXT: fmov d9, d2 ; CHECK-GI-NEXT: bl log10 -; CHECK-GI-NEXT: fmov d10, d0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d8 ; CHECK-GI-NEXT: bl log10 -; CHECK-GI-NEXT: fmov d8, d0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d9 ; CHECK-GI-NEXT: bl log10 -; CHECK-GI-NEXT: fmov d1, d8 -; CHECK-GI-NEXT: ldp d9, d8, [sp, #8] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fmov d2, d0 -; CHECK-GI-NEXT: fmov d0, d10 -; CHECK-GI-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v3.d[1], v1.d[0] +; CHECK-GI-NEXT: mov d1, v3.d[1] +; CHECK-GI-NEXT: fmov d0, d3 +; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret entry: %c = call <3 x double> @llvm.log10.v3f64(<3 x double> %a) @@ -5567,7 +5595,9 @@ define <3 x float> @log10_v3f32(<3 x float> %a) { ; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] ; CHECK-GI-NEXT: mov v1.s[2], v0.s[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll index bbfec8c7c3361..83b6f3c26f34c 100644 --- a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll +++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll @@ -16,8 +16,12 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec ; ; CHECK-GI-LABEL: vector_deinterleave_v2f16_v4f16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: uzp1 v2.4h, v0.4h, v0.4h -; CHECK-GI-NEXT: uzp2 v1.4h, v0.4h, v0.4h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov v2.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[0], v0.h[1] +; CHECK-GI-NEXT: mov v2.h[1], v0.h[2] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[3] +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-GI-NEXT: fmov d0, d2 ; CHECK-GI-NEXT: ret %retval = call {<2 x half>, <2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half> %vec) diff --git a/llvm/test/CodeGen/AArch64/fminimummaximum.ll b/llvm/test/CodeGen/AArch64/fminimummaximum.ll index fb12f8acf1745..e8201f62599b7 100644 --- a/llvm/test/CodeGen/AArch64/fminimummaximum.ll +++ b/llvm/test/CodeGen/AArch64/fminimummaximum.ll @@ -154,6 +154,7 @@ define <3 x double> 
@min_v3f64(<3 x double> %a, <3 x double> %b) { ; CHECK-GI-NEXT: fmin d2, d2, d5 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: fmin v0.2d, v0.2d, v3.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -191,6 +192,7 @@ define <3 x double> @max_v3f64(<3 x double> %a, <3 x double> %b) { ; CHECK-GI-NEXT: fmax d2, d2, d5 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: fmax v0.2d, v0.2d, v3.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -255,20 +257,48 @@ entry: } define <3 x float> @min_v3f32(<3 x float> %a, <3 x float> %b) { -; CHECK-LABEL: min_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmin v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: min_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmin v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: min_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v3.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v3.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v3.s[2], v1.s[2] +; CHECK-GI-NEXT: fmin v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b) ret <3 x float> %c } define <3 x float> @max_v3f32(<3 x float> %a, <3 x float> %b) { -; CHECK-LABEL: max_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmax v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: max_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmax v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: max_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v3.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v3.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v3.s[2], v1.s[2] +; CHECK-GI-NEXT: fmax v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b) ret <3 x float> %c diff --git a/llvm/test/CodeGen/AArch64/fminmax.ll b/llvm/test/CodeGen/AArch64/fminmax.ll index 64f0da8b4cd0f..8a613907807c4 100644 --- a/llvm/test/CodeGen/AArch64/fminmax.ll +++ b/llvm/test/CodeGen/AArch64/fminmax.ll @@ -154,6 +154,7 @@ define <3 x double> @min_v3f64(<3 x double> %a, <3 x double> %b) { ; CHECK-GI-NEXT: fminnm d2, d2, d5 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: fminnm v0.2d, v0.2d, v3.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -191,6 +192,7 @@ define <3 x double> @max_v3f64(<3 x double> %a, <3 x double> %b) { ; CHECK-GI-NEXT: fmaxnm d2, d2, d5 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: fmaxnm v0.2d, v0.2d, v3.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -255,20 +257,48 
@@ entry: } define <3 x float> @min_v3f32(<3 x float> %a, <3 x float> %b) { -; CHECK-LABEL: min_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fminnm v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: min_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fminnm v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: min_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v3.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v3.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v3.s[2], v1.s[2] +; CHECK-GI-NEXT: fminnm v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = call <3 x float> @llvm.minnum.v3f32(<3 x float> %a, <3 x float> %b) ret <3 x float> %c } define <3 x float> @max_v3f32(<3 x float> %a, <3 x float> %b) { -; CHECK-LABEL: max_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: max_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmaxnm v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: max_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v3.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v3.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v3.s[2], v1.s[2] +; CHECK-GI-NEXT: fmaxnm v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %a, <3 x float> %b) ret <3 x float> %c diff --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll index 7bcaae5a77eac..7ed9425ed42e9 100644 --- a/llvm/test/CodeGen/AArch64/fmla.ll +++ b/llvm/test/CodeGen/AArch64/fmla.ll @@ -105,6 +105,7 @@ define <3 x double> @fma_v3f64(<3 x double> %a, <3 x double> %b, <3 x double> %c ; CHECK-GI-NEXT: fmla v6.2d, v3.2d, v0.2d ; CHECK-GI-NEXT: ldr d0, [sp] ; CHECK-GI-NEXT: fmadd d2, d2, d5, d0 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: mov d1, v6.d[1] ; CHECK-GI-NEXT: fmov d0, d6 ; CHECK-GI-NEXT: ret @@ -138,11 +139,28 @@ entry: } define <3 x float> @fma_v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c) { -; CHECK-LABEL: fma_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmla v2.4s, v1.4s, v0.4s -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fma_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmla v2.4s, v1.4s, v0.4s +; CHECK-SD-NEXT: mov v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fma_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v3.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v4.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v5.s[0], v2.s[0] +; CHECK-GI-NEXT: mov v3.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v4.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v5.s[1], v2.s[1] +; CHECK-GI-NEXT: mov v3.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v4.s[2], v1.s[2] +; CHECK-GI-NEXT: mov v5.s[2], v2.s[2] +; CHECK-GI-NEXT: fmla v5.4s, v4.4s, v3.4s +; CHECK-GI-NEXT: mov v0.s[0], v5.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v5.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v5.s[2] +; CHECK-GI-NEXT: ret entry: %d = call <3 x float> @llvm.fma.v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c) ret <3 x float> %d @@ -756,6 +774,7 @@ define <3 x double> @fmuladd_v3f64(<3 
x double> %a, <3 x double> %b, <3 x double ; CHECK-GI-NEXT: fmla v6.2d, v3.2d, v0.2d ; CHECK-GI-NEXT: ldr d0, [sp] ; CHECK-GI-NEXT: fmadd d2, d2, d5, d0 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: mov d1, v6.d[1] ; CHECK-GI-NEXT: fmov d0, d6 ; CHECK-GI-NEXT: ret @@ -789,11 +808,28 @@ entry: } define <3 x float> @fmuladd_v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c) { -; CHECK-LABEL: fmuladd_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmla v2.4s, v1.4s, v0.4s -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fmuladd_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmla v2.4s, v1.4s, v0.4s +; CHECK-SD-NEXT: mov v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fmuladd_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v3.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v4.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v5.s[0], v2.s[0] +; CHECK-GI-NEXT: mov v3.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v4.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v5.s[1], v2.s[1] +; CHECK-GI-NEXT: mov v3.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v4.s[2], v1.s[2] +; CHECK-GI-NEXT: mov v5.s[2], v2.s[2] +; CHECK-GI-NEXT: fmla v5.4s, v4.4s, v3.4s +; CHECK-GI-NEXT: mov v0.s[0], v5.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v5.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v5.s[2] +; CHECK-GI-NEXT: ret entry: %d = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c) ret <3 x float> %d @@ -1204,6 +1240,7 @@ define <3 x double> @fmul_v3f64(<3 x double> %a, <3 x double> %b, <3 x double> % ; CHECK-GI-NEXT: fmla v6.2d, v0.2d, v3.2d ; CHECK-GI-NEXT: ldr d0, [sp] ; CHECK-GI-NEXT: fmadd d2, d2, d5, d0 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: mov d1, v6.d[1] ; CHECK-GI-NEXT: fmov d0, d6 ; CHECK-GI-NEXT: ret @@ -1262,8 +1299,19 @@ define <3 x float> @fmul_v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c) { ; ; CHECK-GI-LABEL: fmul_v3f32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmla v2.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: mov v3.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v4.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v5.s[0], v2.s[0] +; CHECK-GI-NEXT: mov v3.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v4.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v5.s[1], v2.s[1] +; CHECK-GI-NEXT: mov v3.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v4.s[2], v1.s[2] +; CHECK-GI-NEXT: mov v5.s[2], v2.s[2] +; CHECK-GI-NEXT: fmla v5.4s, v3.4s, v4.4s +; CHECK-GI-NEXT: mov v0.s[0], v5.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v5.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v5.s[2] ; CHECK-GI-NEXT: ret entry: %d = fmul fast <3 x float> %a, %b diff --git a/llvm/test/CodeGen/AArch64/fmul.ll b/llvm/test/CodeGen/AArch64/fmul.ll index bd3d1353e643e..f045c5ab96c4e 100644 --- a/llvm/test/CodeGen/AArch64/fmul.ll +++ b/llvm/test/CodeGen/AArch64/fmul.ll @@ -93,6 +93,7 @@ define <3 x double> @fmul_v3f64(<3 x double> %a, <3 x double> %b) { ; CHECK-GI-NEXT: fmul d2, d2, d5 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: fmul v0.2d, v0.2d, v3.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -130,10 +131,24 @@ entry: } define <3 x float> @fmul_v3f32(<3 x float> %a, <3 x float> %b) { -; CHECK-LABEL: fmul_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fmul_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ret 
+; +; CHECK-GI-LABEL: fmul_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v3.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v3.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v3.s[2], v1.s[2] +; CHECK-GI-NEXT: fmul v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = fmul <3 x float> %a, %b ret <3 x float> %c diff --git a/llvm/test/CodeGen/AArch64/fneg.ll b/llvm/test/CodeGen/AArch64/fneg.ll index de2671afe60ab..bcd4bcf4c2b0b 100644 --- a/llvm/test/CodeGen/AArch64/fneg.ll +++ b/llvm/test/CodeGen/AArch64/fneg.ll @@ -88,6 +88,7 @@ define <3 x double> @fabs_v3f64(<3 x double> %a) { ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: fneg d2, d2 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: fneg v0.2d, v0.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -119,10 +120,21 @@ entry: } define <3 x float> @fabs_v3f32(<3 x float> %a) { -; CHECK-LABEL: fabs_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fneg v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fabs_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fneg v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fabs_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: fneg v1.4s, v1.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = fneg <3 x float> %a ret <3 x float> %c diff --git a/llvm/test/CodeGen/AArch64/fpext.ll b/llvm/test/CodeGen/AArch64/fpext.ll index df90f9d5f0910..7a30b68be6eae 100644 --- a/llvm/test/CodeGen/AArch64/fpext.ll +++ b/llvm/test/CodeGen/AArch64/fpext.ll @@ -82,9 +82,12 @@ define <3 x double> @fpext_v3f32_v3f64(<3 x float> %a) { ; ; CHECK-GI-LABEL: fpext_v3f32_v3f64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s1, v0.s[2] -; CHECK-GI-NEXT: fcvtl v0.2d, v0.2s -; CHECK-GI-NEXT: fcvt d2, s1 +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov s2, v0.s[2] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: fcvt d2, s2 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: fcvtl v0.2d, v1.2s ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret @@ -355,10 +358,14 @@ define <3 x double> @fpext_v3f16_v3f64(<3 x half> %a) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: fcvt d0, h0 +; CHECK-GI-NEXT: fcvt d3, h0 +; CHECK-GI-NEXT: mov h0, v0.h[2] ; CHECK-GI-NEXT: fcvt d1, h1 -; CHECK-GI-NEXT: fcvt d2, h2 +; CHECK-GI-NEXT: fcvt d2, h0 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: mov v3.d[1], v1.d[0] +; CHECK-GI-NEXT: mov d1, v3.d[1] +; CHECK-GI-NEXT: fmov d0, d3 ; CHECK-GI-NEXT: ret entry: %c = fpext <3 x half> %a to <3 x double> @@ -403,10 +410,22 @@ entry: } define <3 x float> @fpext_v3f16_v3f32(<3 x half> %a) { -; CHECK-LABEL: fpext_v3f16_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fcvtl v0.4s, v0.4h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fpext_v3f16_v3f32: +; CHECK-SD: // %bb.0: // %entry 
+; CHECK-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fpext_v3f16_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: fcvtl v1.4s, v1.4h +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = fpext <3 x half> %a to <3 x float> ret <3 x float> %c diff --git a/llvm/test/CodeGen/AArch64/fpow.ll b/llvm/test/CodeGen/AArch64/fpow.ll index dc93d5be9b3f3..08589d647d189 100644 --- a/llvm/test/CodeGen/AArch64/fpow.ll +++ b/llvm/test/CodeGen/AArch64/fpow.ll @@ -156,38 +156,42 @@ define <3 x double> @pow_v3f64(<3 x double> %a, <3 x double> %b) { ; ; CHECK-GI-LABEL: pow_v3f64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str d12, [sp, #-48]! // 8-byte Folded Spill -; CHECK-GI-NEXT: stp d11, d10, [sp, #8] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp d9, d8, [sp, #24] // 16-byte Folded Spill -; CHECK-GI-NEXT: str x30, [sp, #40] // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 48 -; CHECK-GI-NEXT: .cfi_offset w30, -8 -; CHECK-GI-NEXT: .cfi_offset b8, -16 -; CHECK-GI-NEXT: .cfi_offset b9, -24 -; CHECK-GI-NEXT: .cfi_offset b10, -32 -; CHECK-GI-NEXT: .cfi_offset b11, -40 -; CHECK-GI-NEXT: .cfi_offset b12, -48 +; CHECK-GI-NEXT: sub sp, sp, #80 +; CHECK-GI-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 80 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -24 +; CHECK-GI-NEXT: .cfi_offset b9, -32 +; CHECK-GI-NEXT: .cfi_offset b10, -40 +; CHECK-GI-NEXT: .cfi_offset b11, -48 ; CHECK-GI-NEXT: fmov d8, d1 ; CHECK-GI-NEXT: fmov d1, d3 ; CHECK-GI-NEXT: fmov d9, d2 ; CHECK-GI-NEXT: fmov d10, d4 ; CHECK-GI-NEXT: fmov d11, d5 ; CHECK-GI-NEXT: bl pow -; CHECK-GI-NEXT: fmov d12, d0 -; CHECK-GI-NEXT: fmov d0, d8 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d1, d10 +; CHECK-GI-NEXT: fmov d0, d8 ; CHECK-GI-NEXT: bl pow -; CHECK-GI-NEXT: fmov d8, d0 -; CHECK-GI-NEXT: fmov d0, d9 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d1, d11 +; CHECK-GI-NEXT: fmov d0, d9 ; CHECK-GI-NEXT: bl pow -; CHECK-GI-NEXT: fmov d1, d8 -; CHECK-GI-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp d11, d10, [sp, #8] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fmov d2, d0 -; CHECK-GI-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload -; CHECK-GI-NEXT: fmov d0, d12 -; CHECK-GI-NEXT: ldr d12, [sp], #48 // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.d[1], v1.d[0] +; CHECK-GI-NEXT: mov d1, v3.d[1] +; CHECK-GI-NEXT: fmov d0, d3 +; CHECK-GI-NEXT: add sp, sp, #80 ; CHECK-GI-NEXT: ret entry: %c = call <3 x double> @llvm.pow.v3f64(<3 x double> %a, <3 x double> %b) @@ -419,7 +423,9 @@ define <3 x float> @pow_v3f32(<3 x float> %a, <3 x float> %b) { ; 
CHECK-GI-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] ; CHECK-GI-NEXT: mov v1.s[2], v0.s[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: add sp, sp, #80 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/fpowi.ll b/llvm/test/CodeGen/AArch64/fpowi.ll index 8948556d1b380..af81d5fa5bf6f 100644 --- a/llvm/test/CodeGen/AArch64/fpowi.ll +++ b/llvm/test/CodeGen/AArch64/fpowi.ll @@ -149,33 +149,37 @@ define <3 x double> @powi_v3f64(<3 x double> %a, i32 %b) { ; ; CHECK-GI-LABEL: powi_v3f64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str d10, [sp, #-48]! // 8-byte Folded Spill -; CHECK-GI-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 48 +; CHECK-GI-NEXT: sub sp, sp, #64 +; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x30, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 ; CHECK-GI-NEXT: .cfi_offset w19, -8 ; CHECK-GI-NEXT: .cfi_offset w30, -16 ; CHECK-GI-NEXT: .cfi_offset b8, -24 ; CHECK-GI-NEXT: .cfi_offset b9, -32 -; CHECK-GI-NEXT: .cfi_offset b10, -48 ; CHECK-GI-NEXT: fmov d8, d1 ; CHECK-GI-NEXT: fmov d9, d2 ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: bl __powidf2 -; CHECK-GI-NEXT: fmov d10, d0 -; CHECK-GI-NEXT: fmov d0, d8 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: fmov d0, d8 ; CHECK-GI-NEXT: bl __powidf2 -; CHECK-GI-NEXT: fmov d8, d0 -; CHECK-GI-NEXT: fmov d0, d9 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: fmov d0, d9 ; CHECK-GI-NEXT: bl __powidf2 -; CHECK-GI-NEXT: fmov d1, d8 -; CHECK-GI-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fmov d2, d0 -; CHECK-GI-NEXT: fmov d0, d10 -; CHECK-GI-NEXT: ldr d10, [sp], #48 // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp x30, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.d[1], v1.d[0] +; CHECK-GI-NEXT: mov d1, v3.d[1] +; CHECK-GI-NEXT: fmov d0, d3 +; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret entry: %c = call <3 x double> @llvm.powi.v3f64.i32(<3 x double> %a, i32 %b) @@ -393,7 +397,9 @@ define <3 x float> @powi_v3f32(<3 x float> %a, i32 %b) { ; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] ; CHECK-GI-NEXT: mov v1.s[2], v0.s[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll index 9c4f0207b84ce..1ab72b7dc0056 100644 --- a/llvm/test/CodeGen/AArch64/fptoi.ll +++ b/llvm/test/CodeGen/AArch64/fptoi.ll @@ -1015,32 +1015,60 @@ entry: } define <3 x i32> @fptos_v3f64_v3i32(<3 x double> %a) { -; CHECK-LABEL: fptos_v3f64_v3i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // 
kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: mov v0.d[1], v1.d[0] -; CHECK-NEXT: fcvtzs v1.2d, v2.2d -; CHECK-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fptos_v3f64_v3i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: fcvtzs v1.2d, v2.2d +; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d +; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fptos_v3f64_v3i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: fcvtzs v1.2d, v2.2d +; CHECK-GI-NEXT: fcvtzs v0.2d, v0.2d +; CHECK-GI-NEXT: uzp1 v1.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = fptosi <3 x double> %a to <3 x i32> ret <3 x i32> %c } define <3 x i32> @fptou_v3f64_v3i32(<3 x double> %a) { -; CHECK-LABEL: fptou_v3f64_v3i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: mov v0.d[1], v1.d[0] -; CHECK-NEXT: fcvtzu v1.2d, v2.2d -; CHECK-NEXT: fcvtzu v0.2d, v0.2d -; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fptou_v3f64_v3i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: fcvtzu v1.2d, v2.2d +; CHECK-SD-NEXT: fcvtzu v0.2d, v0.2d +; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fptou_v3f64_v3i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: fcvtzu v1.2d, v2.2d +; CHECK-GI-NEXT: fcvtzu v0.2d, v0.2d +; CHECK-GI-NEXT: uzp1 v1.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = fptoui <3 x double> %a to <3 x i32> ret <3 x i32> %c @@ -1375,17 +1403,33 @@ entry: } define <3 x i16> @fptos_v3f64_v3i16(<3 x double> %a) { -; CHECK-LABEL: fptos_v3f64_v3i16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: mov v0.d[1], v1.d[0] -; CHECK-NEXT: fcvtzs v1.2d, v2.2d -; CHECK-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fptos_v3f64_v3i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: 
fcvtzs v1.2d, v2.2d +; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d +; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fptos_v3f64_v3i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: fcvtzs v1.2d, v2.2d +; CHECK-GI-NEXT: fcvtzs v0.2d, v0.2d +; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret entry: %c = fptosi <3 x double> %a to <3 x i16> ret <3 x i16> %c @@ -1413,7 +1457,11 @@ define <3 x i16> @fptou_v3f64_v3i16(<3 x double> %a) { ; CHECK-GI-NEXT: fcvtzu v1.2d, v2.2d ; CHECK-GI-NEXT: fcvtzu v0.2d, v0.2d ; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: %c = fptoui <3 x double> %a to <3 x i16> @@ -1876,15 +1924,18 @@ define <3 x i8> @fptos_v3f64_v3i8(<3 x double> %a) { ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-NEXT: fcvtzs v1.2d, v2.2d ; CHECK-GI-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-GI-NEXT: fmov x2, d1 -; CHECK-GI-NEXT: // kill: def $w2 killed $w2 killed $x2 -; CHECK-GI-NEXT: mov d2, v0.d[1] -; CHECK-GI-NEXT: fmov x0, d0 -; CHECK-GI-NEXT: // kill: def $w0 killed $w0 killed $x0 -; CHECK-GI-NEXT: fmov x1, d2 -; CHECK-GI-NEXT: // kill: def $w1 killed $w1 killed $x1 +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: mov v0.s[0], v0.s[0] +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: fcvtzs v1.2d, v2.2d +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: mov s2, v0.s[2] +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: fmov w1, s1 +; CHECK-GI-NEXT: fmov w2, s2 ; CHECK-GI-NEXT: ret entry: %c = fptosi <3 x double> %a to <3 x i8> @@ -1913,15 +1964,18 @@ define <3 x i8> @fptou_v3f64_v3i8(<3 x double> %a) { ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-NEXT: fcvtzu v1.2d, v2.2d ; CHECK-GI-NEXT: fcvtzu v0.2d, v0.2d -; CHECK-GI-NEXT: fmov x2, d1 -; CHECK-GI-NEXT: // kill: def $w2 killed $w2 killed $x2 -; CHECK-GI-NEXT: mov d2, v0.d[1] -; CHECK-GI-NEXT: fmov x0, d0 -; CHECK-GI-NEXT: // kill: def $w0 killed $w0 killed $x0 -; CHECK-GI-NEXT: fmov x1, d2 -; CHECK-GI-NEXT: // kill: def $w1 killed $w1 killed $x1 +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: mov v0.s[0], v0.s[0] +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: fcvtzu v1.2d, v2.2d +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: mov s2, v0.s[2] +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: fmov w1, s1 +; CHECK-GI-NEXT: fmov w2, s2 ; CHECK-GI-NEXT: ret entry: %c = fptoui <3 x double> %a to <3 x i8> @@ -2585,14 +2639,16 @@ define <3 x i64> @fptos_v3f32_v3i64(<3 x float> %a) { ; ; CHECK-GI-LABEL: fptos_v3f32_v3i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: 
mov v1.s[0], v0.s[2] -; CHECK-GI-NEXT: fcvtl v0.2d, v0.2s +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v0.s[0], v0.s[2] ; CHECK-GI-NEXT: fcvtl v1.2d, v1.2s -; CHECK-GI-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-GI-NEXT: fcvtzs v2.2d, v1.2d +; CHECK-GI-NEXT: fcvtl v2.2d, v0.2s +; CHECK-GI-NEXT: fcvtzs v0.2d, v1.2d +; CHECK-GI-NEXT: fcvtzs v2.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: ret entry: %c = fptosi <3 x float> %a to <3 x i64> @@ -2614,14 +2670,16 @@ define <3 x i64> @fptou_v3f32_v3i64(<3 x float> %a) { ; ; CHECK-GI-LABEL: fptou_v3f32_v3i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov v1.s[0], v0.s[2] -; CHECK-GI-NEXT: fcvtl v0.2d, v0.2s +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v0.s[0], v0.s[2] ; CHECK-GI-NEXT: fcvtl v1.2d, v1.2s -; CHECK-GI-NEXT: fcvtzu v0.2d, v0.2d -; CHECK-GI-NEXT: fcvtzu v2.2d, v1.2d +; CHECK-GI-NEXT: fcvtl v2.2d, v0.2s +; CHECK-GI-NEXT: fcvtzu v0.2d, v1.2d +; CHECK-GI-NEXT: fcvtzu v2.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: ret entry: %c = fptoui <3 x float> %a to <3 x i64> @@ -3025,20 +3083,42 @@ entry: } define <3 x i32> @fptos_v3f32_v3i32(<3 x float> %a) { -; CHECK-LABEL: fptos_v3f32_v3i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fptos_v3f32_v3i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fcvtzs v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fptos_v3f32_v3i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: fcvtzs v1.4s, v1.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = fptosi <3 x float> %a to <3 x i32> ret <3 x i32> %c } define <3 x i32> @fptou_v3f32_v3i32(<3 x float> %a) { -; CHECK-LABEL: fptou_v3f32_v3i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fptou_v3f32_v3i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fcvtzu v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fptou_v3f32_v3i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: fcvtzu v1.4s, v1.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = fptoui <3 x float> %a to <3 x i32> ret <3 x i32> %c @@ -3172,22 +3252,48 @@ entry: } define <3 x i16> @fptos_v3f32_v3i16(<3 x float> %a) { -; CHECK-LABEL: fptos_v3f32_v3i16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fptos_v3f32_v3i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fcvtzs v0.4s, v0.4s +; CHECK-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fptos_v3f32_v3i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov 
v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: fcvtzs v0.4s, v1.4s +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret entry: %c = fptosi <3 x float> %a to <3 x i16> ret <3 x i16> %c } define <3 x i16> @fptou_v3f32_v3i16(<3 x float> %a) { -; CHECK-LABEL: fptou_v3f32_v3i16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fptou_v3f32_v3i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fcvtzu v0.4s, v0.4s +; CHECK-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fptou_v3f32_v3i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: fcvtzu v0.4s, v1.4s +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret entry: %c = fptoui <3 x float> %a to <3 x i16> ret <3 x i16> %c @@ -3414,7 +3520,10 @@ define <3 x i8> @fptos_v3f32_v3i8(<3 x float> %a) { ; ; CHECK-GI-LABEL: fptos_v3f32_v3i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fcvtzs v0.4s, v0.4s +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: fcvtzs v0.4s, v1.4s ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: mov s2, v0.s[2] ; CHECK-GI-NEXT: fmov w0, s0 @@ -3438,7 +3547,10 @@ define <3 x i8> @fptou_v3f32_v3i8(<3 x float> %a) { ; ; CHECK-GI-LABEL: fptou_v3f32_v3i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fcvtzu v0.4s, v0.4s +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: fcvtzu v0.4s, v1.4s ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: mov s2, v0.s[2] ; CHECK-GI-NEXT: fmov w0, s0 @@ -4056,7 +4168,11 @@ define <3 x i64> @fptos_v3f16_v3i64(<3 x half> %a) { ; ; CHECK-GI-NOFP16-LABEL: fptos_v3f16_v3i64: ; CHECK-GI-NOFP16: // %bb.0: // %entry -; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h +; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v1.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v1.2d, v0.2s ; CHECK-GI-NOFP16-NEXT: fcvtl2 v2.2d, v0.4s ; CHECK-GI-NOFP16-NEXT: fcvtzs v0.2d, v1.2d @@ -4120,7 +4236,11 @@ define <3 x i64> @fptou_v3f16_v3i64(<3 x half> %a) { ; ; CHECK-GI-NOFP16-LABEL: fptou_v3f16_v3i64: ; CHECK-GI-NOFP16: // %bb.0: // %entry -; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h +; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v1.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v1.2d, v0.2s ; CHECK-GI-NOFP16-NEXT: fcvtl2 v2.2d, v0.4s ; CHECK-GI-NOFP16-NEXT: fcvtzu v0.2d, v1.2d @@ -5729,22 +5849,48 @@ entry: } define <3 x i32> @fptos_v3f16_v3i32(<3 x half> %a) { -; CHECK-LABEL: fptos_v3f16_v3i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fcvtl v0.4s, v0.4h -; CHECK-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fptos_v3f16_v3i32: 
+; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-SD-NEXT: fcvtzs v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fptos_v3f16_v3i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: fcvtl v0.4s, v1.4h +; CHECK-GI-NEXT: fcvtzs v1.4s, v0.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = fptosi <3 x half> %a to <3 x i32> ret <3 x i32> %c } define <3 x i32> @fptou_v3f16_v3i32(<3 x half> %a) { -; CHECK-LABEL: fptou_v3f16_v3i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fcvtl v0.4s, v0.4h -; CHECK-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fptou_v3f16_v3i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-SD-NEXT: fcvtzu v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fptou_v3f16_v3i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: fcvtl v0.4s, v1.4h +; CHECK-GI-NEXT: fcvtzu v1.4s, v0.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = fptoui <3 x half> %a to <3 x i32> ret <3 x i32> %c @@ -6027,14 +6173,37 @@ define <3 x i16> @fptos_v3f16_v3i16(<3 x half> %a) { ; ; CHECK-GI-NOFP16-LABEL: fptos_v3f16_v3i16: ; CHECK-GI-NOFP16: // %bb.0: // %entry -; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h +; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v1.4h ; CHECK-GI-NOFP16-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-GI-NOFP16-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1] +; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9 +; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: fptos_v3f16_v3i16: ; CHECK-GI-FP16: // %bb.0: // %entry -; CHECK-GI-FP16-NEXT: fcvtzs v0.4h, v0.4h +; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-FP16-NEXT: fcvtzs v1.4h, v1.4h +; CHECK-GI-FP16-NEXT: umov w8, v1.h[0] +; CHECK-GI-FP16-NEXT: umov w9, v1.h[1] +; CHECK-GI-FP16-NEXT: mov v0.s[0], w8 +; CHECK-GI-FP16-NEXT: umov w8, v1.h[2] +; CHECK-GI-FP16-NEXT: mov v0.s[1], w9 +; CHECK-GI-FP16-NEXT: mov v0.s[2], w8 +; CHECK-GI-FP16-NEXT: mov w8, v0.s[1] +; CHECK-GI-FP16-NEXT: mov w9, v0.s[2] +; CHECK-GI-FP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-FP16-NEXT: mov v0.h[2], w9 +; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-FP16-NEXT: ret entry: %c = fptosi <3 x half> %a to <3 x i16> @@ -6056,14 +6225,37 @@ define <3 x i16> @fptou_v3f16_v3i16(<3 x half> %a) { ; ; CHECK-GI-NOFP16-LABEL: fptou_v3f16_v3i16: ; CHECK-GI-NOFP16: // %bb.0: // %entry -; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h +; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0] +; 
CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v1.4h ; CHECK-GI-NOFP16-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-GI-NOFP16-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1] +; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9 +; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: fptou_v3f16_v3i16: ; CHECK-GI-FP16: // %bb.0: // %entry -; CHECK-GI-FP16-NEXT: fcvtzu v0.4h, v0.4h +; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-FP16-NEXT: fcvtzu v1.4h, v1.4h +; CHECK-GI-FP16-NEXT: umov w8, v1.h[0] +; CHECK-GI-FP16-NEXT: umov w9, v1.h[1] +; CHECK-GI-FP16-NEXT: mov v0.s[0], w8 +; CHECK-GI-FP16-NEXT: umov w8, v1.h[2] +; CHECK-GI-FP16-NEXT: mov v0.s[1], w9 +; CHECK-GI-FP16-NEXT: mov v0.s[2], w8 +; CHECK-GI-FP16-NEXT: mov w8, v0.s[1] +; CHECK-GI-FP16-NEXT: mov w9, v0.s[2] +; CHECK-GI-FP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-FP16-NEXT: mov v0.h[2], w9 +; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-FP16-NEXT: ret entry: %c = fptoui <3 x half> %a to <3 x i16> @@ -6493,7 +6685,11 @@ define <3 x i8> @fptos_v3f16_v3i8(<3 x half> %a) { ; ; CHECK-GI-NOFP16-LABEL: fptos_v3f16_v3i8: ; CHECK-GI-NOFP16: // %bb.0: // %entry -; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h +; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v1.4h ; CHECK-GI-NOFP16-NEXT: fcvtzs v0.4s, v0.4s ; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] ; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[2] @@ -6504,10 +6700,22 @@ define <3 x i8> @fptos_v3f16_v3i8(<3 x half> %a) { ; ; CHECK-GI-FP16-LABEL: fptos_v3f16_v3i8: ; CHECK-GI-FP16: // %bb.0: // %entry -; CHECK-GI-FP16-NEXT: fcvtzs v0.4h, v0.4h -; CHECK-GI-FP16-NEXT: umov w0, v0.h[0] -; CHECK-GI-FP16-NEXT: umov w1, v0.h[1] -; CHECK-GI-FP16-NEXT: umov w2, v0.h[2] +; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-FP16-NEXT: fcvtzs v0.4h, v1.4h +; CHECK-GI-FP16-NEXT: umov w8, v0.h[0] +; CHECK-GI-FP16-NEXT: umov w9, v0.h[1] +; CHECK-GI-FP16-NEXT: mov v1.s[0], w8 +; CHECK-GI-FP16-NEXT: umov w8, v0.h[2] +; CHECK-GI-FP16-NEXT: mov v1.s[1], w9 +; CHECK-GI-FP16-NEXT: mov v1.s[2], w8 +; CHECK-GI-FP16-NEXT: mov s0, v1.s[1] +; CHECK-GI-FP16-NEXT: mov s2, v1.s[2] +; CHECK-GI-FP16-NEXT: fmov w0, s1 +; CHECK-GI-FP16-NEXT: fmov w1, s0 +; CHECK-GI-FP16-NEXT: fmov w2, s2 ; CHECK-GI-FP16-NEXT: ret entry: %c = fptosi <3 x half> %a to <3 x i8> @@ -6535,7 +6743,11 @@ define <3 x i8> @fptou_v3f16_v3i8(<3 x half> %a) { ; ; CHECK-GI-NOFP16-LABEL: fptou_v3f16_v3i8: ; CHECK-GI-NOFP16: // %bb.0: // %entry -; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h +; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v1.4h ; CHECK-GI-NOFP16-NEXT: fcvtzu v0.4s, v0.4s ; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] ; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[2] @@ -6546,10 
+6758,22 @@ define <3 x i8> @fptou_v3f16_v3i8(<3 x half> %a) { ; ; CHECK-GI-FP16-LABEL: fptou_v3f16_v3i8: ; CHECK-GI-FP16: // %bb.0: // %entry -; CHECK-GI-FP16-NEXT: fcvtzu v0.4h, v0.4h -; CHECK-GI-FP16-NEXT: umov w0, v0.h[0] -; CHECK-GI-FP16-NEXT: umov w1, v0.h[1] -; CHECK-GI-FP16-NEXT: umov w2, v0.h[2] +; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-FP16-NEXT: fcvtzu v0.4h, v1.4h +; CHECK-GI-FP16-NEXT: umov w8, v0.h[0] +; CHECK-GI-FP16-NEXT: umov w9, v0.h[1] +; CHECK-GI-FP16-NEXT: mov v1.s[0], w8 +; CHECK-GI-FP16-NEXT: umov w8, v0.h[2] +; CHECK-GI-FP16-NEXT: mov v1.s[1], w9 +; CHECK-GI-FP16-NEXT: mov v1.s[2], w8 +; CHECK-GI-FP16-NEXT: mov s0, v1.s[1] +; CHECK-GI-FP16-NEXT: mov s2, v1.s[2] +; CHECK-GI-FP16-NEXT: fmov w0, s1 +; CHECK-GI-FP16-NEXT: fmov w1, s0 +; CHECK-GI-FP16-NEXT: fmov w2, s2 ; CHECK-GI-FP16-NEXT: ret entry: %c = fptoui <3 x half> %a to <3 x i8> @@ -7323,11 +7547,14 @@ define <3 x i64> @fptos_v3f128_v3i64(<3 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov x20, x0 ; CHECK-GI-NEXT: bl __fixtfdi -; CHECK-GI-NEXT: fmov d0, x19 -; CHECK-GI-NEXT: fmov d1, x20 +; CHECK-GI-NEXT: mov v0.d[0], x19 +; CHECK-GI-NEXT: mov v2.d[0], x0 ; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: mov v0.d[1], x20 ; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: fmov d2, x0 +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret entry: @@ -7380,11 +7607,14 @@ define <3 x i64> @fptou_v3f128_v3i64(<3 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov x20, x0 ; CHECK-GI-NEXT: bl __fixunstfdi -; CHECK-GI-NEXT: fmov d0, x19 -; CHECK-GI-NEXT: fmov d1, x20 +; CHECK-GI-NEXT: mov v0.d[0], x19 +; CHECK-GI-NEXT: mov v2.d[0], x0 ; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: mov v0.d[1], x20 ; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: fmov d2, x0 +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret entry: @@ -7519,11 +7749,14 @@ define <3 x i32> @fptos_v3f128_v3i32(<3 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w20, w0 ; CHECK-GI-NEXT: bl __fixtfsi -; CHECK-GI-NEXT: mov v0.s[0], w19 +; CHECK-GI-NEXT: mov v1.s[0], w19 ; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v0.s[1], w20 +; CHECK-GI-NEXT: mov v1.s[1], w20 ; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v0.s[2], w0 +; CHECK-GI-NEXT: mov v1.s[2], w0 +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret entry: @@ -7572,11 +7805,14 @@ define <3 x i32> @fptou_v3f128_v3i32(<3 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w20, w0 ; CHECK-GI-NEXT: bl __fixunstfsi -; CHECK-GI-NEXT: mov v0.s[0], w19 +; CHECK-GI-NEXT: mov v1.s[0], w19 ; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov 
v0.s[1], w20 +; CHECK-GI-NEXT: mov v1.s[1], w20 ; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v0.s[2], w0 +; CHECK-GI-NEXT: mov v1.s[2], w0 +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret entry: @@ -7714,11 +7950,15 @@ define <3 x i16> @fptos_v3f128_v3i16(<3 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w20, w0 ; CHECK-GI-NEXT: bl __fixtfsi -; CHECK-GI-NEXT: fmov s0, w19 +; CHECK-GI-NEXT: mov v0.s[0], w19 ; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v0.h[1], w20 +; CHECK-GI-NEXT: mov v0.s[1], w20 ; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v0.h[2], w0 +; CHECK-GI-NEXT: mov v0.s[2], w0 +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret @@ -7771,11 +8011,15 @@ define <3 x i16> @fptou_v3f128_v3i16(<3 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w20, w0 ; CHECK-GI-NEXT: bl __fixunstfsi -; CHECK-GI-NEXT: fmov s0, w19 +; CHECK-GI-NEXT: mov v0.s[0], w19 ; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v0.h[1], w20 +; CHECK-GI-NEXT: mov v0.s[1], w20 ; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v0.h[2], w0 +; CHECK-GI-NEXT: mov v0.s[2], w0 +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret @@ -7917,11 +8161,16 @@ define <3 x i8> @fptos_v3f128_v3i8(<3 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w20, w0 ; CHECK-GI-NEXT: bl __fixtfsi -; CHECK-GI-NEXT: mov w2, w0 -; CHECK-GI-NEXT: mov w0, w19 -; CHECK-GI-NEXT: mov w1, w20 -; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v0.s[0], w19 ; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v0.s[1], w20 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v0.s[2], w0 +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: mov s2, v0.s[2] +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: fmov w1, s1 +; CHECK-GI-NEXT: fmov w2, s2 ; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret entry: @@ -7976,11 +8225,16 @@ define <3 x i8> @fptou_v3f128_v3i8(<3 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w20, w0 ; CHECK-GI-NEXT: bl __fixunstfsi -; CHECK-GI-NEXT: mov w2, w0 -; CHECK-GI-NEXT: mov w0, w19 -; CHECK-GI-NEXT: mov w1, w20 -; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v0.s[0], w19 ; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v0.s[1], w20 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v0.s[2], w0 +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: mov s2, v0.s[2] +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: fmov w1, s1 +; CHECK-GI-NEXT: fmov w2, s2 ; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret entry: diff --git 
a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll index 9ef6d61c350ec..8dae8328f3ceb 100644 --- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll @@ -48,10 +48,21 @@ define <2 x i32> @test_signed_v2f32_v2i32(<2 x float> %f) { } define <3 x i32> @test_signed_v3f32_v3i32(<3 x float> %f) { -; CHECK-LABEL: test_signed_v3f32_v3i32: -; CHECK: // %bb.0: -; CHECK-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_signed_v3f32_v3i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fcvtzs v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_signed_v3f32_v3i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: fcvtzs v1.4s, v1.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret %x = call <3 x i32> @llvm.fptosi.sat.v3f32.v3i32(<3 x float> %f) ret <3 x i32> %x } @@ -320,7 +331,10 @@ define <3 x i32> @test_signed_v3f64_v3i32(<3 x double> %f) { ; CHECK-GI-NEXT: cmgt v3.2d, v0.2d, v2.2d ; CHECK-GI-NEXT: bif v1.16b, v2.16b, v4.16b ; CHECK-GI-NEXT: bif v0.16b, v2.16b, v3.16b -; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: uzp1 v1.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: ret %x = call <3 x i32> @llvm.fptosi.sat.v3f64.v3i32(<3 x double> %f) ret <3 x i32> %x @@ -902,14 +916,17 @@ define <3 x i32> @test_signed_v3f128_v3i32(<3 x fp128> %f) { ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: mov v1.16b, v0.16b ; CHECK-GI-NEXT: bl __unordtf2 -; CHECK-GI-NEXT: mov v0.s[0], w21 +; CHECK-GI-NEXT: mov v1.s[0], w21 ; CHECK-GI-NEXT: cmp w0, #0 ; CHECK-GI-NEXT: csel w8, wzr, w19, ne ; CHECK-GI-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x22, x21, [sp, #96] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v0.s[1], w23 +; CHECK-GI-NEXT: mov v1.s[1], w23 ; CHECK-GI-NEXT: ldp x30, x23, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: add sp, sp, #128 ; CHECK-GI-NEXT: ret %x = call <3 x i32> @llvm.fptosi.sat.v3f128.v3i32(<3 x fp128> %f) @@ -1221,11 +1238,24 @@ define <2 x i32> @test_signed_v2f16_v2i32(<2 x half> %f) { } define <3 x i32> @test_signed_v3f16_v3i32(<3 x half> %f) { -; CHECK-LABEL: test_signed_v3f16_v3i32: -; CHECK: // %bb.0: -; CHECK-NEXT: fcvtl v0.4s, v0.4h -; CHECK-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_signed_v3f16_v3i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-SD-NEXT: fcvtzs v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_signed_v3f16_v3i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: fcvtl v0.4s, v1.4h +; CHECK-GI-NEXT: fcvtzs v1.4s, v0.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret %x = call <3 x i32> @llvm.fptosi.sat.v3f16.v3i32(<3 x half> %f) ret <3 x i32> %x } diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll 
b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll index e1670ad2dc053..a86c41a7b7edd 100644 --- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll @@ -48,10 +48,21 @@ define <2 x i32> @test_unsigned_v2f32_v2i32(<2 x float> %f) { } define <3 x i32> @test_unsigned_v3f32_v3i32(<3 x float> %f) { -; CHECK-LABEL: test_unsigned_v3f32_v3i32: -; CHECK: // %bb.0: -; CHECK-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_unsigned_v3f32_v3i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fcvtzu v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_unsigned_v3f32_v3i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: fcvtzu v1.4s, v1.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret %x = call <3 x i32> @llvm.fptoui.sat.v3f32.v3i32(<3 x float> %f) ret <3 x i32> %x } @@ -308,7 +319,10 @@ define <3 x i32> @test_unsigned_v3f64_v3i32(<3 x double> %f) { ; CHECK-GI-NEXT: bif v1.16b, v2.16b, v4.16b ; CHECK-GI-NEXT: cmhi v3.2d, v2.2d, v0.2d ; CHECK-GI-NEXT: bif v0.16b, v2.16b, v3.16b -; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: uzp1 v1.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: ret %x = call <3 x i32> @llvm.fptoui.sat.v3f64.v3i32(<3 x double> %f) ret <3 x i32> %x @@ -781,12 +795,15 @@ define <3 x i32> @test_unsigned_v3f128_v3i32(<3 x fp128> %f) { ; CHECK-GI-NEXT: csel x8, x23, x21, gt ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: bl __fixunstfsi -; CHECK-GI-NEXT: mov v0.s[0], w19 +; CHECK-GI-NEXT: mov v1.s[0], w19 ; CHECK-GI-NEXT: ldp x22, x21, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x30, x23, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v0.s[1], w20 +; CHECK-GI-NEXT: mov v1.s[1], w20 ; CHECK-GI-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v0.s[2], w0 +; CHECK-GI-NEXT: mov v1.s[2], w0 +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: add sp, sp, #112 ; CHECK-GI-NEXT: ret %x = call <3 x i32> @llvm.fptoui.sat.v3f128.v3i32(<3 x fp128> %f) @@ -1052,11 +1069,24 @@ define <2 x i32> @test_unsigned_v2f16_v2i32(<2 x half> %f) { } define <3 x i32> @test_unsigned_v3f16_v3i32(<3 x half> %f) { -; CHECK-LABEL: test_unsigned_v3f16_v3i32: -; CHECK: // %bb.0: -; CHECK-NEXT: fcvtl v0.4s, v0.4h -; CHECK-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_unsigned_v3f16_v3i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-SD-NEXT: fcvtzu v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_unsigned_v3f16_v3i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: fcvtl v0.4s, v1.4h +; CHECK-GI-NEXT: fcvtzu v1.4s, v0.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret %x = call <3 x i32> @llvm.fptoui.sat.v3f16.v3i32(<3 x half> %f) ret <3 x i32> %x } diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll index 2187717c4148a..89ac7dbe42487 100644 --- 
a/llvm/test/CodeGen/AArch64/fptrunc.ll +++ b/llvm/test/CodeGen/AArch64/fptrunc.ll @@ -130,9 +130,12 @@ define <2 x half> @fptrunc_v2f128_v2f16(<2 x fp128> %a) { ; CHECK-GI-NEXT: bl __trunctfhf2 ; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: bl __trunctfhf2 -; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[0] +; CHECK-GI-NEXT: mov h0, v1.h[1] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[0] +; CHECK-GI-NEXT: mov v0.16b, v1.16b ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret @@ -261,10 +264,13 @@ define <3 x float> @fptrunc_v3f64_v3f32(<3 x double> %a) { ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: fcvt s2, d2 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d +; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v2.s[0] ; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] ; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] -; CHECK-GI-NEXT: mov v0.s[2], v2.s[0] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: ret entry: %c = fptrunc <3 x double> %a to <3 x float> @@ -295,6 +301,8 @@ define <2 x half> @fptrunc_v2f64_v2f16(<2 x double> %a) { ; CHECK-GI-NEXT: fcvt h0, d0 ; CHECK-GI-NEXT: fcvt h1, d1 ; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov h1, v0.h[1] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -318,8 +326,16 @@ define <3 x half> @fptrunc_v3f64_v3f16(<3 x double> %a) { ; CHECK-GI-NEXT: fcvt h0, d0 ; CHECK-GI-NEXT: fcvt h1, d1 ; CHECK-GI-NEXT: fcvt h2, d2 -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -366,6 +382,9 @@ define <2 x half> @fptrunc_v2f32_v2f16(<2 x float> %a) { ; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] ; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] ; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-GI-NEXT: mov h1, v0.h[1] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: %c = fptrunc <2 x float> %a to <2 x half> @@ -373,10 +392,29 @@ entry: } define <3 x half> @fptrunc_v3f32_v3f16(<3 x float> %a) { -; CHECK-LABEL: fptrunc_v3f32_v3f16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fcvtn v0.4h, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fptrunc_v3f32_v3f16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fptrunc_v3f32_v3f16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: fcvtn v1.4h, v1.4s +; CHECK-GI-NEXT: umov w8, v1.h[0] +; CHECK-GI-NEXT: umov w9, v1.h[1] +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: umov w8, v1.h[2] +; CHECK-GI-NEXT: mov v0.s[1], w9 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: 
mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret entry: %c = fptrunc <3 x float> %a to <3 x half> ret <3 x half> %c diff --git a/llvm/test/CodeGen/AArch64/frem.ll b/llvm/test/CodeGen/AArch64/frem.ll index feb13da64cbf8..cc2443497ad83 100644 --- a/llvm/test/CodeGen/AArch64/frem.ll +++ b/llvm/test/CodeGen/AArch64/frem.ll @@ -157,38 +157,42 @@ define <3 x double> @frem_v3f64(<3 x double> %a, <3 x double> %b) { ; ; CHECK-GI-LABEL: frem_v3f64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str d12, [sp, #-48]! // 8-byte Folded Spill -; CHECK-GI-NEXT: stp d11, d10, [sp, #8] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp d9, d8, [sp, #24] // 16-byte Folded Spill -; CHECK-GI-NEXT: str x30, [sp, #40] // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 48 -; CHECK-GI-NEXT: .cfi_offset w30, -8 -; CHECK-GI-NEXT: .cfi_offset b8, -16 -; CHECK-GI-NEXT: .cfi_offset b9, -24 -; CHECK-GI-NEXT: .cfi_offset b10, -32 -; CHECK-GI-NEXT: .cfi_offset b11, -40 -; CHECK-GI-NEXT: .cfi_offset b12, -48 +; CHECK-GI-NEXT: sub sp, sp, #80 +; CHECK-GI-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 80 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -24 +; CHECK-GI-NEXT: .cfi_offset b9, -32 +; CHECK-GI-NEXT: .cfi_offset b10, -40 +; CHECK-GI-NEXT: .cfi_offset b11, -48 ; CHECK-GI-NEXT: fmov d8, d1 ; CHECK-GI-NEXT: fmov d1, d3 ; CHECK-GI-NEXT: fmov d9, d2 ; CHECK-GI-NEXT: fmov d10, d4 ; CHECK-GI-NEXT: fmov d11, d5 ; CHECK-GI-NEXT: bl fmod -; CHECK-GI-NEXT: fmov d12, d0 -; CHECK-GI-NEXT: fmov d0, d8 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d1, d10 +; CHECK-GI-NEXT: fmov d0, d8 ; CHECK-GI-NEXT: bl fmod -; CHECK-GI-NEXT: fmov d8, d0 -; CHECK-GI-NEXT: fmov d0, d9 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d1, d11 +; CHECK-GI-NEXT: fmov d0, d9 ; CHECK-GI-NEXT: bl fmod -; CHECK-GI-NEXT: fmov d1, d8 -; CHECK-GI-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp d11, d10, [sp, #8] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fmov d2, d0 -; CHECK-GI-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload -; CHECK-GI-NEXT: fmov d0, d12 -; CHECK-GI-NEXT: ldr d12, [sp], #48 // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.d[1], v1.d[0] +; CHECK-GI-NEXT: mov d1, v3.d[1] +; CHECK-GI-NEXT: fmov d0, d3 +; CHECK-GI-NEXT: add sp, sp, #80 ; CHECK-GI-NEXT: ret entry: %c = frem <3 x double> %a, %b @@ -420,7 +424,9 @@ define <3 x float> @frem_v3f32(<3 x float> %a, <3 x float> %b) { ; CHECK-GI-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] ; CHECK-GI-NEXT: mov v1.s[2], v0.s[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: add sp, sp, #80 ; 
CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/fsincos.ll b/llvm/test/CodeGen/AArch64/fsincos.ll index 2afc56a7139fb..4136dfe010ead 100644 --- a/llvm/test/CodeGen/AArch64/fsincos.ll +++ b/llvm/test/CodeGen/AArch64/fsincos.ll @@ -138,29 +138,33 @@ define <3 x double> @sin_v3f64(<3 x double> %a) { ; ; CHECK-GI-LABEL: sin_v3f64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill -; CHECK-GI-NEXT: stp d9, d8, [sp, #8] // 16-byte Folded Spill -; CHECK-GI-NEXT: str x30, [sp, #24] // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 -; CHECK-GI-NEXT: .cfi_offset w30, -8 -; CHECK-GI-NEXT: .cfi_offset b8, -16 -; CHECK-GI-NEXT: .cfi_offset b9, -24 -; CHECK-GI-NEXT: .cfi_offset b10, -32 +; CHECK-GI-NEXT: sub sp, sp, #64 +; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -24 +; CHECK-GI-NEXT: .cfi_offset b9, -32 ; CHECK-GI-NEXT: fmov d8, d1 ; CHECK-GI-NEXT: fmov d9, d2 ; CHECK-GI-NEXT: bl sin -; CHECK-GI-NEXT: fmov d10, d0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d8 ; CHECK-GI-NEXT: bl sin -; CHECK-GI-NEXT: fmov d8, d0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d9 ; CHECK-GI-NEXT: bl sin -; CHECK-GI-NEXT: fmov d1, d8 -; CHECK-GI-NEXT: ldp d9, d8, [sp, #8] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fmov d2, d0 -; CHECK-GI-NEXT: fmov d0, d10 -; CHECK-GI-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v3.d[1], v1.d[0] +; CHECK-GI-NEXT: mov d1, v3.d[1] +; CHECK-GI-NEXT: fmov d0, d3 +; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret entry: %c = call <3 x double> @llvm.sin.v3f64(<3 x double> %a) @@ -354,7 +358,9 @@ define <3 x float> @sin_v3f32(<3 x float> %a) { ; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] ; CHECK-GI-NEXT: mov v1.s[2], v0.s[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret entry: @@ -1440,29 +1446,33 @@ define <3 x double> @cos_v3f64(<3 x double> %a) { ; ; CHECK-GI-LABEL: cos_v3f64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str d10, [sp, #-32]! 
// 8-byte Folded Spill -; CHECK-GI-NEXT: stp d9, d8, [sp, #8] // 16-byte Folded Spill -; CHECK-GI-NEXT: str x30, [sp, #24] // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 -; CHECK-GI-NEXT: .cfi_offset w30, -8 -; CHECK-GI-NEXT: .cfi_offset b8, -16 -; CHECK-GI-NEXT: .cfi_offset b9, -24 -; CHECK-GI-NEXT: .cfi_offset b10, -32 +; CHECK-GI-NEXT: sub sp, sp, #64 +; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -24 +; CHECK-GI-NEXT: .cfi_offset b9, -32 ; CHECK-GI-NEXT: fmov d8, d1 ; CHECK-GI-NEXT: fmov d9, d2 ; CHECK-GI-NEXT: bl cos -; CHECK-GI-NEXT: fmov d10, d0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d8 ; CHECK-GI-NEXT: bl cos -; CHECK-GI-NEXT: fmov d8, d0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov d0, d9 ; CHECK-GI-NEXT: bl cos -; CHECK-GI-NEXT: fmov d1, d8 -; CHECK-GI-NEXT: ldp d9, d8, [sp, #8] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fmov d2, d0 -; CHECK-GI-NEXT: fmov d0, d10 -; CHECK-GI-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v3.d[1], v1.d[0] +; CHECK-GI-NEXT: mov d1, v3.d[1] +; CHECK-GI-NEXT: fmov d0, d3 +; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret entry: %c = call <3 x double> @llvm.cos.v3f64(<3 x double> %a) @@ -1656,7 +1666,9 @@ define <3 x float> @cos_v3f32(<3 x float> %a) { ; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] ; CHECK-GI-NEXT: mov v1.s[2], v0.s[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: add sp, sp, #64 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/fsqrt.ll b/llvm/test/CodeGen/AArch64/fsqrt.ll index 6c5fd8e52b017..7514c9235b039 100644 --- a/llvm/test/CodeGen/AArch64/fsqrt.ll +++ b/llvm/test/CodeGen/AArch64/fsqrt.ll @@ -84,6 +84,7 @@ define <3 x double> @sqrt_v3f64(<3 x double> %a) { ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: fsqrt d2, d2 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: fsqrt v0.2d, v0.2d ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -115,10 +116,21 @@ entry: } define <3 x float> @sqrt_v3f32(<3 x float> %a) { -; CHECK-LABEL: sqrt_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fsqrt v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sqrt_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fsqrt v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sqrt_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: fsqrt v1.4s, v1.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: 
%c = call <3 x float> @llvm.sqrt.v3f32(<3 x float> %a) ret <3 x float> %c diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll index 61964060ca2c8..9a49266ace1d9 100644 --- a/llvm/test/CodeGen/AArch64/icmp.ll +++ b/llvm/test/CodeGen/AArch64/icmp.ll @@ -1155,28 +1155,29 @@ define <3 x i64> @v3i64_i64(<3 x i64> %a, <3 x i64> %b, <3 x i64> %d, <3 x i64> ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3 ; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4 -; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: // kill: def $d6 killed $d6 def $q6 -; CHECK-GI-NEXT: // kill: def $d5 killed $d5 def $q5 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: // kill: def $d7 killed $d7 def $q7 +; CHECK-GI-NEXT: // kill: def $d5 killed $d5 def $q5 ; CHECK-GI-NEXT: ldr x8, [sp] ; CHECK-GI-NEXT: ldr x10, [sp, #24] ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] -; CHECK-GI-NEXT: cmgt v2.2d, v5.2d, v2.2d -; CHECK-GI-NEXT: ldp d1, d4, [sp, #8] ; CHECK-GI-NEXT: mov v6.d[1], v7.d[0] -; CHECK-GI-NEXT: fmov x9, d2 +; CHECK-GI-NEXT: ldp d1, d4, [sp, #8] +; CHECK-GI-NEXT: cmgt v2.2d, v5.2d, v2.2d ; CHECK-GI-NEXT: mov v1.d[1], v4.d[0] ; CHECK-GI-NEXT: cmgt v0.2d, v3.2d, v0.2d +; CHECK-GI-NEXT: fmov x9, d2 ; CHECK-GI-NEXT: sbfx x9, x9, #0, #1 ; CHECK-GI-NEXT: bsl v0.16b, v6.16b, v1.16b ; CHECK-GI-NEXT: and x8, x8, x9 ; CHECK-GI-NEXT: bic x9, x10, x9 ; CHECK-GI-NEXT: orr x8, x8, x9 -; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: mov v2.d[0], x8 ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: ret entry: %c = icmp slt <3 x i64> %a, %b @@ -1227,22 +1228,37 @@ define <3 x i32> @v3i32_i32(<3 x i32> %a, <3 x i32> %b, <3 x i32> %d, <3 x i32> ; ; CHECK-GI-LABEL: v3i32_i32: ; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v4.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v5.s[0], v1.s[0] ; CHECK-GI-NEXT: mov w8, #31 // =0x1f +; CHECK-GI-NEXT: mov v6.s[0], w8 ; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff -; CHECK-GI-NEXT: cmgt v0.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: mov v4.s[0], w8 -; CHECK-GI-NEXT: mov v5.s[0], w9 -; CHECK-GI-NEXT: mov v4.s[1], w8 -; CHECK-GI-NEXT: mov v5.s[1], w9 -; CHECK-GI-NEXT: mov v4.s[2], w8 -; CHECK-GI-NEXT: mov v5.s[2], w9 -; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s -; CHECK-GI-NEXT: neg v1.4s, v4.4s -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: eor v1.16b, v0.16b, v5.16b -; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b -; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b -; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: mov v4.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v5.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v6.s[1], w8 +; CHECK-GI-NEXT: mov v4.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v5.s[2], v1.s[2] +; CHECK-GI-NEXT: mov v0.s[0], w9 +; CHECK-GI-NEXT: mov v6.s[2], w8 +; CHECK-GI-NEXT: cmgt v1.4s, v5.4s, v4.4s +; CHECK-GI-NEXT: mov v4.s[0], v2.s[0] +; CHECK-GI-NEXT: mov v5.s[0], v3.s[0] +; CHECK-GI-NEXT: mov v0.s[1], w9 +; CHECK-GI-NEXT: ushl v1.4s, v1.4s, v6.4s +; CHECK-GI-NEXT: neg v6.4s, v6.4s +; CHECK-GI-NEXT: mov v4.s[1], v2.s[1] +; CHECK-GI-NEXT: mov v5.s[1], v3.s[1] +; CHECK-GI-NEXT: mov v0.s[2], w9 +; CHECK-GI-NEXT: sshl v1.4s, v1.4s, v6.4s +; CHECK-GI-NEXT: mov v4.s[2], v2.s[2] +; CHECK-GI-NEXT: mov v5.s[2], v3.s[2] +; CHECK-GI-NEXT: eor v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: and v1.16b, v4.16b, v1.16b +; CHECK-GI-NEXT: and v0.16b, 
v5.16b, v0.16b +; CHECK-GI-NEXT: orr v1.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: ret entry: %c = icmp slt <3 x i32> %a, %b diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll index 54ee693db1239..c67d3b4ee9f41 100644 --- a/llvm/test/CodeGen/AArch64/insertextract.ll +++ b/llvm/test/CodeGen/AArch64/insertextract.ll @@ -299,12 +299,18 @@ define <3 x float> @insert_v3f32_c(<3 x float> %a, float %b, i32 %c) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: sub sp, sp, #16 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] ; CHECK-GI-NEXT: mov w9, w0 ; CHECK-GI-NEXT: mov x8, sp -; CHECK-GI-NEXT: str q0, [sp] ; CHECK-GI-NEXT: and x9, x9, #0x3 +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v0.s[2] +; CHECK-GI-NEXT: str q2, [sp] ; CHECK-GI-NEXT: str s1, [x8, x9, lsl #2] -; CHECK-GI-NEXT: ldr q0, [sp], #16 +; CHECK-GI-NEXT: ldr q1, [sp], #16 +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: ret entry: %d = insertelement <3 x float> %a, float %b, i32 %c @@ -1019,12 +1025,18 @@ define <3 x i32> @insert_v3i32_c(<3 x i32> %a, i32 %b, i32 %c) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: sub sp, sp, #16 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] ; CHECK-GI-NEXT: mov w9, w1 ; CHECK-GI-NEXT: mov x8, sp -; CHECK-GI-NEXT: str q0, [sp] ; CHECK-GI-NEXT: and x9, x9, #0x3 +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: str q1, [sp] ; CHECK-GI-NEXT: str w0, [x8, x9, lsl #2] -; CHECK-GI-NEXT: ldr q0, [sp], #16 +; CHECK-GI-NEXT: ldr q1, [sp], #16 +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: ret entry: %d = insertelement <3 x i32> %a, i32 %b, i32 %c @@ -1578,10 +1590,13 @@ define float @extract_v3f32_c(<3 x float> %a, i32 %c) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: sub sp, sp, #16 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] ; CHECK-GI-NEXT: mov w9, w0 ; CHECK-GI-NEXT: mov x8, sp -; CHECK-GI-NEXT: str q0, [sp] ; CHECK-GI-NEXT: and x9, x9, #0x3 +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: str q1, [sp] ; CHECK-GI-NEXT: ldr s0, [x8, x9, lsl #2] ; CHECK-GI-NEXT: add sp, sp, #16 ; CHECK-GI-NEXT: ret @@ -2272,10 +2287,13 @@ define i32 @extract_v3i32_c(<3 x i32> %a, i32 %c) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: sub sp, sp, #16 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] ; CHECK-GI-NEXT: mov w9, w0 ; CHECK-GI-NEXT: mov x8, sp -; CHECK-GI-NEXT: str q0, [sp] ; CHECK-GI-NEXT: and x9, x9, #0x3 +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: str q1, [sp] ; CHECK-GI-NEXT: ldr w0, [x8, x9, lsl #2] ; CHECK-GI-NEXT: add sp, sp, #16 ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll index 81c1a64f2d434..caff8c527d34a 100644 --- a/llvm/test/CodeGen/AArch64/itofp.ll +++ b/llvm/test/CodeGen/AArch64/itofp.ll @@ -1345,18 +1345,16 @@ define <3 x double> @stofp_v3i128_v3f64(<3 x i128> %a) { ; ; CHECK-GI-LABEL: stofp_v3i128_v3f64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: stp d9, d8, [sp, #-64]! 
// 16-byte Folded Spill -; CHECK-GI-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-GI-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: sub sp, sp, #80 +; CHECK-GI-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 80 ; CHECK-GI-NEXT: .cfi_offset w19, -8 ; CHECK-GI-NEXT: .cfi_offset w20, -16 ; CHECK-GI-NEXT: .cfi_offset w21, -24 ; CHECK-GI-NEXT: .cfi_offset w22, -32 ; CHECK-GI-NEXT: .cfi_offset w30, -48 -; CHECK-GI-NEXT: .cfi_offset b8, -56 -; CHECK-GI-NEXT: .cfi_offset b9, -64 ; CHECK-GI-NEXT: mov x19, x2 ; CHECK-GI-NEXT: mov x20, x3 ; CHECK-GI-NEXT: mov x21, x4 @@ -1364,19 +1362,24 @@ define <3 x double> @stofp_v3i128_v3f64(<3 x i128> %a) { ; CHECK-GI-NEXT: bl __floattidf ; CHECK-GI-NEXT: mov x0, x19 ; CHECK-GI-NEXT: mov x1, x20 -; CHECK-GI-NEXT: fmov d8, d0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: bl __floattidf ; CHECK-GI-NEXT: mov x0, x21 ; CHECK-GI-NEXT: mov x1, x22 -; CHECK-GI-NEXT: fmov d9, d0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: bl __floattidf -; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fmov d2, d0 -; CHECK-GI-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-GI-NEXT: fmov d0, d8 -; CHECK-GI-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-GI-NEXT: fmov d1, d9 -; CHECK-GI-NEXT: ldp d9, d8, [sp], #64 // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.d[1], v1.d[0] +; CHECK-GI-NEXT: mov d1, v3.d[1] +; CHECK-GI-NEXT: fmov d0, d3 +; CHECK-GI-NEXT: add sp, sp, #80 ; CHECK-GI-NEXT: ret entry: %c = sitofp <3 x i128> %a to <3 x double> @@ -1422,18 +1425,16 @@ define <3 x double> @utofp_v3i128_v3f64(<3 x i128> %a) { ; ; CHECK-GI-LABEL: utofp_v3i128_v3f64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: stp d9, d8, [sp, #-64]! 
// 16-byte Folded Spill -; CHECK-GI-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-GI-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: sub sp, sp, #80 +; CHECK-GI-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 80 ; CHECK-GI-NEXT: .cfi_offset w19, -8 ; CHECK-GI-NEXT: .cfi_offset w20, -16 ; CHECK-GI-NEXT: .cfi_offset w21, -24 ; CHECK-GI-NEXT: .cfi_offset w22, -32 ; CHECK-GI-NEXT: .cfi_offset w30, -48 -; CHECK-GI-NEXT: .cfi_offset b8, -56 -; CHECK-GI-NEXT: .cfi_offset b9, -64 ; CHECK-GI-NEXT: mov x19, x2 ; CHECK-GI-NEXT: mov x20, x3 ; CHECK-GI-NEXT: mov x21, x4 @@ -1441,19 +1442,24 @@ define <3 x double> @utofp_v3i128_v3f64(<3 x i128> %a) { ; CHECK-GI-NEXT: bl __floatuntidf ; CHECK-GI-NEXT: mov x0, x19 ; CHECK-GI-NEXT: mov x1, x20 -; CHECK-GI-NEXT: fmov d8, d0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: bl __floatuntidf ; CHECK-GI-NEXT: mov x0, x21 ; CHECK-GI-NEXT: mov x1, x22 -; CHECK-GI-NEXT: fmov d9, d0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: bl __floatuntidf -; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: fmov d2, d0 -; CHECK-GI-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-GI-NEXT: fmov d0, d8 -; CHECK-GI-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-GI-NEXT: fmov d1, d9 -; CHECK-GI-NEXT: ldp d9, d8, [sp], #64 // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.d[1], v1.d[0] +; CHECK-GI-NEXT: mov d1, v3.d[1] +; CHECK-GI-NEXT: fmov d0, d3 +; CHECK-GI-NEXT: add sp, sp, #80 ; CHECK-GI-NEXT: ret entry: %c = uitofp <3 x i128> %a to <3 x double> @@ -2009,13 +2015,16 @@ define <3 x double> @stofp_v3i32_v3f64(<3 x i32> %a) { ; ; CHECK-GI-LABEL: stofp_v3i32_v3f64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v1.2d, v0.2s, #0 -; CHECK-GI-NEXT: sshll2 v0.2d, v0.4s, #0 -; CHECK-GI-NEXT: scvtf v3.2d, v1.2d -; CHECK-GI-NEXT: scvtf v2.2d, v0.2d +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v2.s[0], v0.s[2] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: sshll v0.2d, v1.2s, #0 +; CHECK-GI-NEXT: sshll v1.2d, v2.2s, #0 +; CHECK-GI-NEXT: scvtf v0.2d, v0.2d +; CHECK-GI-NEXT: scvtf v2.2d, v1.2d ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-GI-NEXT: mov d1, v3.d[1] -; CHECK-GI-NEXT: fmov d0, d3 +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: %c = sitofp <3 x i32> %a to <3 x double> @@ -2037,13 +2046,16 @@ define <3 x double> @utofp_v3i32_v3f64(<3 x i32> %a) { ; ; CHECK-GI-LABEL: utofp_v3i32_v3f64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll v1.2d, v0.2s, #0 -; CHECK-GI-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-GI-NEXT: ucvtf v3.2d, v1.2d -; CHECK-GI-NEXT: ucvtf v2.2d, v0.2d +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov 
v2.s[0], v0.s[2] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: ushll v0.2d, v1.2s, #0 +; CHECK-GI-NEXT: ushll v1.2d, v2.2s, #0 +; CHECK-GI-NEXT: ucvtf v0.2d, v0.2d +; CHECK-GI-NEXT: ucvtf v2.2d, v1.2d ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-GI-NEXT: mov d1, v3.d[1] -; CHECK-GI-NEXT: fmov d0, d3 +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: %c = uitofp <3 x i32> %a to <3 x double> @@ -2596,7 +2608,11 @@ define <3 x double> @stofp_v3i16_v3f64(<3 x i16> %a) { ; ; CHECK-GI-LABEL: stofp_v3i16_v3f64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: sshll v0.2d, v1.2s, #0 ; CHECK-GI-NEXT: sshll2 v1.2d, v1.4s, #0 ; CHECK-GI-NEXT: scvtf v0.2d, v0.2d @@ -2626,7 +2642,11 @@ define <3 x double> @utofp_v3i16_v3f64(<3 x i16> %a) { ; ; CHECK-GI-LABEL: utofp_v3i16_v3f64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll v1.4s, v0.4h, #0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: ushll v0.2d, v1.2s, #0 ; CHECK-GI-NEXT: ushll2 v1.2d, v1.4s, #0 ; CHECK-GI-NEXT: ucvtf v0.2d, v0.2d @@ -4328,7 +4348,9 @@ define <3 x float> @stofp_v3i128_v3f32(<3 x i128> %a) { ; CHECK-GI-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] ; CHECK-GI-NEXT: mov v1.s[2], v0.s[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: add sp, sp, #80 ; CHECK-GI-NEXT: ret entry: @@ -4412,7 +4434,9 @@ define <3 x float> @utofp_v3i128_v3f32(<3 x i128> %a) { ; CHECK-GI-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] ; CHECK-GI-NEXT: mov v1.s[2], v0.s[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: add sp, sp, #80 ; CHECK-GI-NEXT: ret entry: @@ -4461,13 +4485,16 @@ define <3 x float> @stofp_v3i64_v3f32(<3 x i64> %a) { ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-NEXT: scvtf v2.2d, v2.2d +; CHECK-GI-NEXT: scvtf v1.2d, v2.2d ; CHECK-GI-NEXT: scvtf v0.2d, v0.2d -; CHECK-GI-NEXT: fcvtn v2.2s, v2.2d -; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d -; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] -; CHECK-GI-NEXT: mov v0.s[2], v2.s[0] +; CHECK-GI-NEXT: fcvtn v1.2s, v1.2d +; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[0], v2.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v2.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v2.s[2] ; CHECK-GI-NEXT: ret entry: %c = sitofp <3 x i64> %a to <3 x float> @@ -4493,13 +4520,16 @@ define <3 x float> @utofp_v3i64_v3f32(<3 x i64> %a) { ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-NEXT: ucvtf v2.2d, 
v2.2d +; CHECK-GI-NEXT: ucvtf v1.2d, v2.2d ; CHECK-GI-NEXT: ucvtf v0.2d, v0.2d -; CHECK-GI-NEXT: fcvtn v2.2s, v2.2d -; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d -; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] -; CHECK-GI-NEXT: mov v0.s[2], v2.s[0] +; CHECK-GI-NEXT: fcvtn v1.2s, v1.2d +; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[0], v2.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v2.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v2.s[2] ; CHECK-GI-NEXT: ret entry: %c = uitofp <3 x i64> %a to <3 x float> @@ -4831,20 +4861,42 @@ entry: } define <3 x float> @stofp_v3i32_v3f32(<3 x i32> %a) { -; CHECK-LABEL: stofp_v3i32_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: scvtf v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: stofp_v3i32_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: scvtf v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: stofp_v3i32_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: scvtf v1.4s, v1.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = sitofp <3 x i32> %a to <3 x float> ret <3 x float> %c } define <3 x float> @utofp_v3i32_v3f32(<3 x i32> %a) { -; CHECK-LABEL: utofp_v3i32_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ucvtf v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: utofp_v3i32_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ucvtf v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: utofp_v3i32_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: ucvtf v1.4s, v1.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = uitofp <3 x i32> %a to <3 x float> ret <3 x float> %c @@ -4977,22 +5029,48 @@ entry: } define <3 x float> @stofp_v3i16_v3f32(<3 x i16> %a) { -; CHECK-LABEL: stofp_v3i16_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-NEXT: scvtf v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: stofp_v3i16_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: scvtf v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: stofp_v3i16_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: sshll v0.4s, v1.4h, #0 +; CHECK-GI-NEXT: scvtf v1.4s, v0.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = sitofp <3 x i16> %a to <3 x float> ret <3 x float> %c } define <3 x float> @utofp_v3i16_v3f32(<3 x i16> %a) { -; CHECK-LABEL: utofp_v3i16_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ucvtf v0.4s, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: utofp_v3i16_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ucvtf v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: utofp_v3i16_v3f32: +; CHECK-GI: // %bb.0: // %entry +; 
CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NEXT: ushll v0.4s, v1.4h, #0 +; CHECK-GI-NEXT: ucvtf v1.4s, v0.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %c = uitofp <3 x i16> %a to <3 x float> ret <3 x float> %c @@ -5258,7 +5336,10 @@ define <3 x float> @stofp_v3i8_v3f32(<3 x i8> %a) { ; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-NEXT: scvtf v0.4s, v0.4s +; CHECK-GI-NEXT: scvtf v1.4s, v0.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: ret entry: %c = sitofp <3 x i8> %a to <3 x float> @@ -5288,7 +5369,10 @@ define <3 x float> @utofp_v3i8_v3f32(<3 x i8> %a) { ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-NEXT: ucvtf v0.4s, v0.4s +; CHECK-GI-NEXT: ucvtf v1.4s, v0.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: ret entry: %c = uitofp <3 x i8> %a to <3 x float> @@ -5690,11 +5774,14 @@ define <2 x half> @stofp_v2i128_v2f16(<2 x i128> %a) { ; CHECK-GI-NOFP16-NEXT: mov x1, x20 ; CHECK-GI-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NOFP16-NEXT: bl __floattisf -; CHECK-GI-NOFP16-NEXT: fcvt h1, s0 -; CHECK-GI-NOFP16-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 +; CHECK-GI-NOFP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-GI-NOFP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload ; CHECK-GI-NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[0] +; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.16b, v1.16b ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NOFP16-NEXT: add sp, sp, #48 ; CHECK-GI-NOFP16-NEXT: ret @@ -5721,7 +5808,10 @@ define <2 x half> @stofp_v2i128_v2f16(<2 x i128> %a) { ; CHECK-GI-FP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-FP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload ; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[0] -; CHECK-GI-FP16-NEXT: fmov d0, d1 +; CHECK-GI-FP16-NEXT: mov h0, v1.h[1] +; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[0] +; CHECK-GI-FP16-NEXT: mov v0.16b, v1.16b +; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-FP16-NEXT: add sp, sp, #48 ; CHECK-GI-FP16-NEXT: ret entry: @@ -5803,11 +5893,14 @@ define <2 x half> @utofp_v2i128_v2f16(<2 x i128> %a) { ; CHECK-GI-NOFP16-NEXT: mov x1, x20 ; CHECK-GI-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NOFP16-NEXT: bl __floatuntisf -; CHECK-GI-NOFP16-NEXT: fcvt h1, s0 -; CHECK-GI-NOFP16-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 +; CHECK-GI-NOFP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-GI-NOFP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload ; CHECK-GI-NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[0] +; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[0] +; 
CHECK-GI-NOFP16-NEXT: mov v0.16b, v1.16b ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NOFP16-NEXT: add sp, sp, #48 ; CHECK-GI-NOFP16-NEXT: ret @@ -5834,7 +5927,10 @@ define <2 x half> @utofp_v2i128_v2f16(<2 x i128> %a) { ; CHECK-GI-FP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-FP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload ; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[0] -; CHECK-GI-FP16-NEXT: fmov d0, d1 +; CHECK-GI-FP16-NEXT: mov h0, v1.h[1] +; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[0] +; CHECK-GI-FP16-NEXT: mov v0.16b, v1.16b +; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-FP16-NEXT: add sp, sp, #48 ; CHECK-GI-FP16-NEXT: ret entry: @@ -5927,55 +6023,63 @@ define <3 x half> @stofp_v3i128_v3f16(<3 x i128> %a) { ; ; CHECK-GI-NOFP16-LABEL: stofp_v3i128_v3f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry -; CHECK-GI-NOFP16-NEXT: sub sp, sp, #80 -; CHECK-GI-NOFP16-NEXT: str x30, [sp, #32] // 8-byte Folded Spill -; CHECK-GI-NOFP16-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NOFP16-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NOFP16-NEXT: .cfi_def_cfa_offset 80 +; CHECK-GI-NOFP16-NEXT: stp d9, d8, [sp, #-64]! // 16-byte Folded Spill +; CHECK-GI-NOFP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-GI-NOFP16-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NOFP16-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NOFP16-NEXT: .cfi_def_cfa_offset 64 ; CHECK-GI-NOFP16-NEXT: .cfi_offset w19, -8 ; CHECK-GI-NOFP16-NEXT: .cfi_offset w20, -16 ; CHECK-GI-NOFP16-NEXT: .cfi_offset w21, -24 ; CHECK-GI-NOFP16-NEXT: .cfi_offset w22, -32 ; CHECK-GI-NOFP16-NEXT: .cfi_offset w30, -48 +; CHECK-GI-NOFP16-NEXT: .cfi_offset b8, -56 +; CHECK-GI-NOFP16-NEXT: .cfi_offset b9, -64 ; CHECK-GI-NOFP16-NEXT: mov x19, x2 ; CHECK-GI-NOFP16-NEXT: mov x20, x3 ; CHECK-GI-NOFP16-NEXT: mov x21, x4 ; CHECK-GI-NOFP16-NEXT: mov x22, x5 ; CHECK-GI-NOFP16-NEXT: bl __floattisf -; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 ; CHECK-GI-NOFP16-NEXT: mov x0, x19 ; CHECK-GI-NOFP16-NEXT: mov x1, x20 -; CHECK-GI-NOFP16-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NOFP16-NEXT: fcvt h8, s0 ; CHECK-GI-NOFP16-NEXT: bl __floattisf -; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 ; CHECK-GI-NOFP16-NEXT: mov x0, x21 ; CHECK-GI-NOFP16-NEXT: mov x1, x22 -; CHECK-GI-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NOFP16-NEXT: fcvt h9, s0 ; CHECK-GI-NOFP16-NEXT: bl __floattisf -; CHECK-GI-NOFP16-NEXT: ldp q2, q1, [sp] // 32-byte Folded Reload +; CHECK-GI-NOFP16-NEXT: fmov w8, s8 ; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 -; CHECK-GI-NOFP16-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-NOFP16-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload -; CHECK-GI-NOFP16-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.16b, v1.16b -; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NOFP16-NEXT: add sp, sp, #80 +; CHECK-GI-NOFP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-GI-NOFP16-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NOFP16-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NOFP16-NEXT: mov v1.s[0], w8 +; CHECK-GI-NOFP16-NEXT: fmov w8, s9 +; CHECK-GI-NOFP16-NEXT: mov v1.s[1], w8 +; CHECK-GI-NOFP16-NEXT: fmov w8, s0 +; CHECK-GI-NOFP16-NEXT: mov v1.s[2], w8 +; CHECK-GI-NOFP16-NEXT: 
mov w8, v1.s[1] +; CHECK-GI-NOFP16-NEXT: mov w9, v1.s[2] +; CHECK-GI-NOFP16-NEXT: mov v1.h[1], w8 +; CHECK-GI-NOFP16-NEXT: mov v1.h[2], w9 +; CHECK-GI-NOFP16-NEXT: fmov d0, d1 +; CHECK-GI-NOFP16-NEXT: ldp d9, d8, [sp], #64 // 16-byte Folded Reload ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: stofp_v3i128_v3f16: ; CHECK-GI-FP16: // %bb.0: // %entry -; CHECK-GI-FP16-NEXT: sub sp, sp, #80 -; CHECK-GI-FP16-NEXT: str x30, [sp, #32] // 8-byte Folded Spill -; CHECK-GI-FP16-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-FP16-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-FP16-NEXT: .cfi_def_cfa_offset 80 +; CHECK-GI-FP16-NEXT: stp d9, d8, [sp, #-64]! // 16-byte Folded Spill +; CHECK-GI-FP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-GI-FP16-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-FP16-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-FP16-NEXT: .cfi_def_cfa_offset 64 ; CHECK-GI-FP16-NEXT: .cfi_offset w19, -8 ; CHECK-GI-FP16-NEXT: .cfi_offset w20, -16 ; CHECK-GI-FP16-NEXT: .cfi_offset w21, -24 ; CHECK-GI-FP16-NEXT: .cfi_offset w22, -32 ; CHECK-GI-FP16-NEXT: .cfi_offset w30, -48 +; CHECK-GI-FP16-NEXT: .cfi_offset b8, -56 +; CHECK-GI-FP16-NEXT: .cfi_offset b9, -64 ; CHECK-GI-FP16-NEXT: mov x19, x2 ; CHECK-GI-FP16-NEXT: mov x20, x3 ; CHECK-GI-FP16-NEXT: mov x21, x4 @@ -5983,24 +6087,28 @@ define <3 x half> @stofp_v3i128_v3f16(<3 x i128> %a) { ; CHECK-GI-FP16-NEXT: bl __floattihf ; CHECK-GI-FP16-NEXT: mov x0, x19 ; CHECK-GI-FP16-NEXT: mov x1, x20 -; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0 -; CHECK-GI-FP16-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-FP16-NEXT: fmov s8, s0 ; CHECK-GI-FP16-NEXT: bl __floattihf ; CHECK-GI-FP16-NEXT: mov x0, x21 ; CHECK-GI-FP16-NEXT: mov x1, x22 -; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0 -; CHECK-GI-FP16-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-FP16-NEXT: fmov s9, s0 ; CHECK-GI-FP16-NEXT: bl __floattihf -; CHECK-GI-FP16-NEXT: ldp q2, q1, [sp] // 32-byte Folded Reload -; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0 -; CHECK-GI-FP16-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload -; CHECK-GI-FP16-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-FP16-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-FP16-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[0] -; CHECK-GI-FP16-NEXT: mov v0.16b, v1.16b -; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-FP16-NEXT: add sp, sp, #80 +; CHECK-GI-FP16-NEXT: fmov w8, s8 +; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $s0 +; CHECK-GI-FP16-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-FP16-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-FP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-GI-FP16-NEXT: mov v1.s[0], w8 +; CHECK-GI-FP16-NEXT: fmov w8, s9 +; CHECK-GI-FP16-NEXT: mov v1.s[1], w8 +; CHECK-GI-FP16-NEXT: fmov w8, s0 +; CHECK-GI-FP16-NEXT: mov v1.s[2], w8 +; CHECK-GI-FP16-NEXT: mov w8, v1.s[1] +; CHECK-GI-FP16-NEXT: mov w9, v1.s[2] +; CHECK-GI-FP16-NEXT: mov v1.h[1], w8 +; CHECK-GI-FP16-NEXT: mov v1.h[2], w9 +; CHECK-GI-FP16-NEXT: fmov d0, d1 +; CHECK-GI-FP16-NEXT: ldp d9, d8, [sp], #64 // 16-byte Folded Reload ; CHECK-GI-FP16-NEXT: ret entry: %c = sitofp <3 x i128> %a to <3 x half> @@ -6092,55 +6200,63 @@ define <3 x half> @utofp_v3i128_v3f16(<3 x i128> %a) { ; ; CHECK-GI-NOFP16-LABEL: utofp_v3i128_v3f16: ; CHECK-GI-NOFP16: // 
%bb.0: // %entry -; CHECK-GI-NOFP16-NEXT: sub sp, sp, #80 -; CHECK-GI-NOFP16-NEXT: str x30, [sp, #32] // 8-byte Folded Spill -; CHECK-GI-NOFP16-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NOFP16-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NOFP16-NEXT: .cfi_def_cfa_offset 80 +; CHECK-GI-NOFP16-NEXT: stp d9, d8, [sp, #-64]! // 16-byte Folded Spill +; CHECK-GI-NOFP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-GI-NOFP16-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NOFP16-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NOFP16-NEXT: .cfi_def_cfa_offset 64 ; CHECK-GI-NOFP16-NEXT: .cfi_offset w19, -8 ; CHECK-GI-NOFP16-NEXT: .cfi_offset w20, -16 ; CHECK-GI-NOFP16-NEXT: .cfi_offset w21, -24 ; CHECK-GI-NOFP16-NEXT: .cfi_offset w22, -32 ; CHECK-GI-NOFP16-NEXT: .cfi_offset w30, -48 +; CHECK-GI-NOFP16-NEXT: .cfi_offset b8, -56 +; CHECK-GI-NOFP16-NEXT: .cfi_offset b9, -64 ; CHECK-GI-NOFP16-NEXT: mov x19, x2 ; CHECK-GI-NOFP16-NEXT: mov x20, x3 ; CHECK-GI-NOFP16-NEXT: mov x21, x4 ; CHECK-GI-NOFP16-NEXT: mov x22, x5 ; CHECK-GI-NOFP16-NEXT: bl __floatuntisf -; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 ; CHECK-GI-NOFP16-NEXT: mov x0, x19 ; CHECK-GI-NOFP16-NEXT: mov x1, x20 -; CHECK-GI-NOFP16-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NOFP16-NEXT: fcvt h8, s0 ; CHECK-GI-NOFP16-NEXT: bl __floatuntisf -; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 ; CHECK-GI-NOFP16-NEXT: mov x0, x21 ; CHECK-GI-NOFP16-NEXT: mov x1, x22 -; CHECK-GI-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NOFP16-NEXT: fcvt h9, s0 ; CHECK-GI-NOFP16-NEXT: bl __floatuntisf -; CHECK-GI-NOFP16-NEXT: ldp q2, q1, [sp] // 32-byte Folded Reload +; CHECK-GI-NOFP16-NEXT: fmov w8, s8 ; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 -; CHECK-GI-NOFP16-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-NOFP16-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload -; CHECK-GI-NOFP16-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[0] -; CHECK-GI-NOFP16-NEXT: mov v0.16b, v1.16b -; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NOFP16-NEXT: add sp, sp, #80 +; CHECK-GI-NOFP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-GI-NOFP16-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NOFP16-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NOFP16-NEXT: mov v1.s[0], w8 +; CHECK-GI-NOFP16-NEXT: fmov w8, s9 +; CHECK-GI-NOFP16-NEXT: mov v1.s[1], w8 +; CHECK-GI-NOFP16-NEXT: fmov w8, s0 +; CHECK-GI-NOFP16-NEXT: mov v1.s[2], w8 +; CHECK-GI-NOFP16-NEXT: mov w8, v1.s[1] +; CHECK-GI-NOFP16-NEXT: mov w9, v1.s[2] +; CHECK-GI-NOFP16-NEXT: mov v1.h[1], w8 +; CHECK-GI-NOFP16-NEXT: mov v1.h[2], w9 +; CHECK-GI-NOFP16-NEXT: fmov d0, d1 +; CHECK-GI-NOFP16-NEXT: ldp d9, d8, [sp], #64 // 16-byte Folded Reload ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: utofp_v3i128_v3f16: ; CHECK-GI-FP16: // %bb.0: // %entry -; CHECK-GI-FP16-NEXT: sub sp, sp, #80 -; CHECK-GI-FP16-NEXT: str x30, [sp, #32] // 8-byte Folded Spill -; CHECK-GI-FP16-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-FP16-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-FP16-NEXT: .cfi_def_cfa_offset 80 +; CHECK-GI-FP16-NEXT: stp d9, d8, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK-GI-FP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-GI-FP16-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-FP16-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-FP16-NEXT: .cfi_def_cfa_offset 64 ; CHECK-GI-FP16-NEXT: .cfi_offset w19, -8 ; CHECK-GI-FP16-NEXT: .cfi_offset w20, -16 ; CHECK-GI-FP16-NEXT: .cfi_offset w21, -24 ; CHECK-GI-FP16-NEXT: .cfi_offset w22, -32 ; CHECK-GI-FP16-NEXT: .cfi_offset w30, -48 +; CHECK-GI-FP16-NEXT: .cfi_offset b8, -56 +; CHECK-GI-FP16-NEXT: .cfi_offset b9, -64 ; CHECK-GI-FP16-NEXT: mov x19, x2 ; CHECK-GI-FP16-NEXT: mov x20, x3 ; CHECK-GI-FP16-NEXT: mov x21, x4 @@ -6148,24 +6264,28 @@ define <3 x half> @utofp_v3i128_v3f16(<3 x i128> %a) { ; CHECK-GI-FP16-NEXT: bl __floatuntihf ; CHECK-GI-FP16-NEXT: mov x0, x19 ; CHECK-GI-FP16-NEXT: mov x1, x20 -; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0 -; CHECK-GI-FP16-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-FP16-NEXT: fmov s8, s0 ; CHECK-GI-FP16-NEXT: bl __floatuntihf ; CHECK-GI-FP16-NEXT: mov x0, x21 ; CHECK-GI-FP16-NEXT: mov x1, x22 -; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0 -; CHECK-GI-FP16-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-FP16-NEXT: fmov s9, s0 ; CHECK-GI-FP16-NEXT: bl __floatuntihf -; CHECK-GI-FP16-NEXT: ldp q2, q1, [sp] // 32-byte Folded Reload -; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0 -; CHECK-GI-FP16-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload -; CHECK-GI-FP16-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-FP16-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-FP16-NEXT: mov v1.h[1], v2.h[0] -; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[0] -; CHECK-GI-FP16-NEXT: mov v0.16b, v1.16b -; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-FP16-NEXT: add sp, sp, #80 +; CHECK-GI-FP16-NEXT: fmov w8, s8 +; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $s0 +; CHECK-GI-FP16-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-FP16-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-FP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-GI-FP16-NEXT: mov v1.s[0], w8 +; CHECK-GI-FP16-NEXT: fmov w8, s9 +; CHECK-GI-FP16-NEXT: mov v1.s[1], w8 +; CHECK-GI-FP16-NEXT: fmov w8, s0 +; CHECK-GI-FP16-NEXT: mov v1.s[2], w8 +; CHECK-GI-FP16-NEXT: mov w8, v1.s[1] +; CHECK-GI-FP16-NEXT: mov w9, v1.s[2] +; CHECK-GI-FP16-NEXT: mov v1.h[1], w8 +; CHECK-GI-FP16-NEXT: mov v1.h[2], w9 +; CHECK-GI-FP16-NEXT: fmov d0, d1 +; CHECK-GI-FP16-NEXT: ldp d9, d8, [sp], #64 // 16-byte Folded Reload ; CHECK-GI-FP16-NEXT: ret entry: %c = uitofp <3 x i128> %a to <3 x half> @@ -6202,6 +6322,9 @@ define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) { ; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0] ; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1] ; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s +; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: stofp_v2i64_v2f16: @@ -6211,6 +6334,8 @@ define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) { ; CHECK-GI-FP16-NEXT: fcvt h0, d0 ; CHECK-GI-FP16-NEXT: fcvt h1, d1 ; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-FP16-NEXT: ret entry: @@ -6248,6 +6373,9 @@ define <2 x half> 
@utofp_v2i64_v2f16(<2 x i64> %a) { ; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0] ; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1] ; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s +; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: utofp_v2i64_v2f16: @@ -6257,6 +6385,8 @@ define <2 x half> @utofp_v2i64_v2f16(<2 x i64> %a) { ; CHECK-GI-FP16-NEXT: fcvt h0, d0 ; CHECK-GI-FP16-NEXT: fcvt h1, d1 ; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-FP16-NEXT: ret entry: @@ -6288,7 +6418,18 @@ define <3 x half> @stofp_v3i64_v3f16(<3 x i64> %a) { ; CHECK-GI-NOFP16-NEXT: scvtf v0.2d, v0.2d ; CHECK-GI-NOFP16-NEXT: fcvtn v0.2s, v0.2d ; CHECK-GI-NOFP16-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: umov w8, v1.h[0] +; CHECK-GI-NOFP16-NEXT: umov w9, v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.s[0], w8 +; CHECK-GI-NOFP16-NEXT: umov w8, v1.h[2] +; CHECK-GI-NOFP16-NEXT: mov v0.s[1], w9 +; CHECK-GI-NOFP16-NEXT: mov v0.s[2], w8 +; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1] +; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9 +; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: stofp_v3i64_v3f16: @@ -6303,8 +6444,16 @@ define <3 x half> @stofp_v3i64_v3f16(<3 x i64> %a) { ; CHECK-GI-FP16-NEXT: mov d1, v0.d[1] ; CHECK-GI-FP16-NEXT: fcvt h0, d0 ; CHECK-GI-FP16-NEXT: fcvt h1, d1 -; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-FP16-NEXT: mov v0.h[2], v2.h[0] +; CHECK-GI-FP16-NEXT: fmov w8, s0 +; CHECK-GI-FP16-NEXT: mov v0.s[0], w8 +; CHECK-GI-FP16-NEXT: fmov w8, s1 +; CHECK-GI-FP16-NEXT: mov v0.s[1], w8 +; CHECK-GI-FP16-NEXT: fmov w8, s2 +; CHECK-GI-FP16-NEXT: mov v0.s[2], w8 +; CHECK-GI-FP16-NEXT: mov w8, v0.s[1] +; CHECK-GI-FP16-NEXT: mov w9, v0.s[2] +; CHECK-GI-FP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-FP16-NEXT: mov v0.h[2], w9 ; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-FP16-NEXT: ret entry: @@ -6336,7 +6485,18 @@ define <3 x half> @utofp_v3i64_v3f16(<3 x i64> %a) { ; CHECK-GI-NOFP16-NEXT: ucvtf v0.2d, v0.2d ; CHECK-GI-NOFP16-NEXT: fcvtn v0.2s, v0.2d ; CHECK-GI-NOFP16-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: umov w8, v1.h[0] +; CHECK-GI-NOFP16-NEXT: umov w9, v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.s[0], w8 +; CHECK-GI-NOFP16-NEXT: umov w8, v1.h[2] +; CHECK-GI-NOFP16-NEXT: mov v0.s[1], w9 +; CHECK-GI-NOFP16-NEXT: mov v0.s[2], w8 +; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1] +; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9 +; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: utofp_v3i64_v3f16: @@ -6351,8 +6511,16 @@ define <3 x half> @utofp_v3i64_v3f16(<3 x i64> %a) { ; CHECK-GI-FP16-NEXT: mov d1, v0.d[1] ; CHECK-GI-FP16-NEXT: fcvt h0, d0 ; CHECK-GI-FP16-NEXT: fcvt h1, d1 -; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-FP16-NEXT: mov v0.h[2], v2.h[0] +; CHECK-GI-FP16-NEXT: fmov w8, s0 +; CHECK-GI-FP16-NEXT: mov v0.s[0], w8 +; CHECK-GI-FP16-NEXT: fmov 
w8, s1 +; CHECK-GI-FP16-NEXT: mov v0.s[1], w8 +; CHECK-GI-FP16-NEXT: fmov w8, s2 +; CHECK-GI-FP16-NEXT: mov v0.s[2], w8 +; CHECK-GI-FP16-NEXT: mov w8, v0.s[1] +; CHECK-GI-FP16-NEXT: mov w9, v0.s[2] +; CHECK-GI-FP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-FP16-NEXT: mov v0.h[2], w9 ; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-FP16-NEXT: ret entry: @@ -7184,6 +7352,9 @@ define <2 x half> @stofp_v2i32_v2f16(<2 x i32> %a) { ; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] ; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] ; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-GI-NEXT: mov h1, v0.h[1] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: %c = sitofp <2 x i32> %a to <2 x half> @@ -7204,6 +7375,9 @@ define <2 x half> @utofp_v2i32_v2f16(<2 x i32> %a) { ; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] ; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] ; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-GI-NEXT: mov h1, v0.h[1] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: %c = uitofp <2 x i32> %a to <2 x half> @@ -7211,22 +7385,62 @@ entry: } define <3 x half> @stofp_v3i32_v3f16(<3 x i32> %a) { -; CHECK-LABEL: stofp_v3i32_v3f16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: scvtf v0.4s, v0.4s -; CHECK-NEXT: fcvtn v0.4h, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: stofp_v3i32_v3f16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: scvtf v0.4s, v0.4s +; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: stofp_v3i32_v3f16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: scvtf v0.4s, v1.4s +; CHECK-GI-NEXT: fcvtn v1.4h, v0.4s +; CHECK-GI-NEXT: umov w8, v1.h[0] +; CHECK-GI-NEXT: umov w9, v1.h[1] +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: umov w8, v1.h[2] +; CHECK-GI-NEXT: mov v0.s[1], w9 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret entry: %c = sitofp <3 x i32> %a to <3 x half> ret <3 x half> %c } define <3 x half> @utofp_v3i32_v3f16(<3 x i32> %a) { -; CHECK-LABEL: utofp_v3i32_v3f16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ucvtf v0.4s, v0.4s -; CHECK-NEXT: fcvtn v0.4h, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: utofp_v3i32_v3f16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ucvtf v0.4s, v0.4s +; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: utofp_v3i32_v3f16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: ucvtf v0.4s, v1.4s +; CHECK-GI-NEXT: fcvtn v1.4h, v0.4s +; CHECK-GI-NEXT: umov w8, v1.h[0] +; CHECK-GI-NEXT: umov w9, v1.h[1] +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: umov w8, v1.h[2] +; CHECK-GI-NEXT: mov v0.s[1], w9 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret entry: %c = uitofp <3 x i32> %a to <3 x half> ret <3 x half> %c @@ -7411,12 +7625,18 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) { ; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0] ; 
CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1] ; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s +; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: stofp_v2i16_v2f16: ; CHECK-GI-FP16: // %bb.0: // %entry ; CHECK-GI-FP16-NEXT: uzp1 v0.4h, v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: scvtf v0.4h, v0.4h +; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-FP16-NEXT: ret entry: %c = sitofp <2 x i16> %a to <2 x half> @@ -7446,12 +7666,18 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) { ; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0] ; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1] ; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s +; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: utofp_v2i16_v2f16: ; CHECK-GI-FP16: // %bb.0: // %entry ; CHECK-GI-FP16-NEXT: uzp1 v0.4h, v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h +; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-FP16-NEXT: ret entry: %c = uitofp <2 x i16> %a to <2 x half> @@ -7473,14 +7699,44 @@ define <3 x half> @stofp_v3i16_v3f16(<3 x i16> %a) { ; ; CHECK-GI-NOFP16-LABEL: stofp_v3i16_v3f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry -; CHECK-GI-NOFP16-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NOFP16-NEXT: sshll v0.4s, v1.4h, #0 ; CHECK-GI-NOFP16-NEXT: scvtf v0.4s, v0.4s -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: umov w8, v1.h[0] +; CHECK-GI-NOFP16-NEXT: umov w9, v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.s[0], w8 +; CHECK-GI-NOFP16-NEXT: umov w8, v1.h[2] +; CHECK-GI-NOFP16-NEXT: mov v0.s[1], w9 +; CHECK-GI-NOFP16-NEXT: mov v0.s[2], w8 +; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1] +; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9 +; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: stofp_v3i16_v3f16: ; CHECK-GI-FP16: // %bb.0: // %entry -; CHECK-GI-FP16-NEXT: scvtf v0.4h, v0.4h +; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-FP16-NEXT: scvtf v1.4h, v1.4h +; CHECK-GI-FP16-NEXT: umov w8, v1.h[0] +; CHECK-GI-FP16-NEXT: umov w9, v1.h[1] +; CHECK-GI-FP16-NEXT: mov v0.s[0], w8 +; CHECK-GI-FP16-NEXT: umov w8, v1.h[2] +; CHECK-GI-FP16-NEXT: mov v0.s[1], w9 +; CHECK-GI-FP16-NEXT: mov v0.s[2], w8 +; CHECK-GI-FP16-NEXT: mov w8, v0.s[1] +; CHECK-GI-FP16-NEXT: mov w9, v0.s[2] +; CHECK-GI-FP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-FP16-NEXT: mov v0.h[2], w9 +; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-FP16-NEXT: ret entry: %c = sitofp <3 x i16> %a to <3 x half> @@ -7502,14 +7758,44 @@ define <3 x half> @utofp_v3i16_v3f16(<3 x i16> %a) { ; ; CHECK-GI-NOFP16-LABEL: utofp_v3i16_v3f16: ; CHECK-GI-NOFP16: // %bb.0: 
// %entry -; CHECK-GI-NOFP16-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-NOFP16-NEXT: ushll v0.4s, v1.4h, #0 ; CHECK-GI-NOFP16-NEXT: ucvtf v0.4s, v0.4s -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: umov w8, v1.h[0] +; CHECK-GI-NOFP16-NEXT: umov w9, v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.s[0], w8 +; CHECK-GI-NOFP16-NEXT: umov w8, v1.h[2] +; CHECK-GI-NOFP16-NEXT: mov v0.s[1], w9 +; CHECK-GI-NOFP16-NEXT: mov v0.s[2], w8 +; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1] +; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9 +; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: utofp_v3i16_v3f16: ; CHECK-GI-FP16: // %bb.0: // %entry -; CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h +; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-FP16-NEXT: mov v1.h[0], v0.h[0] +; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[1] +; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[2] +; CHECK-GI-FP16-NEXT: ucvtf v1.4h, v1.4h +; CHECK-GI-FP16-NEXT: umov w8, v1.h[0] +; CHECK-GI-FP16-NEXT: umov w9, v1.h[1] +; CHECK-GI-FP16-NEXT: mov v0.s[0], w8 +; CHECK-GI-FP16-NEXT: umov w8, v1.h[2] +; CHECK-GI-FP16-NEXT: mov v0.s[1], w9 +; CHECK-GI-FP16-NEXT: mov v0.s[2], w8 +; CHECK-GI-FP16-NEXT: mov w8, v0.s[1] +; CHECK-GI-FP16-NEXT: mov w9, v0.s[2] +; CHECK-GI-FP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-FP16-NEXT: mov v0.h[2], w9 +; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-FP16-NEXT: ret entry: %c = uitofp <3 x i16> %a to <3 x half> @@ -7933,6 +8219,9 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) { ; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0] ; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1] ; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s +; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: stofp_v2i8_v2f16: @@ -7941,6 +8230,9 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) { ; CHECK-GI-FP16-NEXT: shl v0.4h, v0.4h, #8 ; CHECK-GI-FP16-NEXT: sshr v0.4h, v0.4h, #8 ; CHECK-GI-FP16-NEXT: scvtf v0.4h, v0.4h +; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-FP16-NEXT: ret entry: %c = sitofp <2 x i8> %a to <2 x half> @@ -7984,6 +8276,9 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) { ; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0] ; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1] ; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s +; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: utofp_v2i8_v2f16: @@ -7992,6 +8287,9 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) { ; CHECK-GI-FP16-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-FP16-NEXT: uzp1 v0.4h, v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h +; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-FP16-NEXT: ret entry: %c = uitofp <2 x i8> %a to <2 x half> @@ -8034,7 +8332,18 @@ define <3 x 
half> @stofp_v3i8_v3f16(<3 x i8> %a) { ; CHECK-GI-NOFP16-NEXT: sshll v1.4s, v1.4h, #0 ; CHECK-GI-NOFP16-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NOFP16-NEXT: scvtf v0.4s, v0.4s -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: umov w8, v1.h[0] +; CHECK-GI-NOFP16-NEXT: umov w9, v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.s[0], w8 +; CHECK-GI-NOFP16-NEXT: umov w8, v1.h[2] +; CHECK-GI-NOFP16-NEXT: mov v0.s[1], w9 +; CHECK-GI-NOFP16-NEXT: mov v0.s[2], w8 +; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1] +; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9 +; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: stofp_v3i8_v3f16: @@ -8043,7 +8352,18 @@ define <3 x half> @stofp_v3i8_v3f16(<3 x i8> %a) { ; CHECK-GI-FP16-NEXT: mov v0.b[1], w1 ; CHECK-GI-FP16-NEXT: mov v0.b[2], w2 ; CHECK-GI-FP16-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-GI-FP16-NEXT: scvtf v0.4h, v0.4h +; CHECK-GI-FP16-NEXT: scvtf v1.4h, v0.4h +; CHECK-GI-FP16-NEXT: umov w8, v1.h[0] +; CHECK-GI-FP16-NEXT: umov w9, v1.h[1] +; CHECK-GI-FP16-NEXT: mov v0.s[0], w8 +; CHECK-GI-FP16-NEXT: umov w8, v1.h[2] +; CHECK-GI-FP16-NEXT: mov v0.s[1], w9 +; CHECK-GI-FP16-NEXT: mov v0.s[2], w8 +; CHECK-GI-FP16-NEXT: mov w8, v0.s[1] +; CHECK-GI-FP16-NEXT: mov w9, v0.s[2] +; CHECK-GI-FP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-FP16-NEXT: mov v0.h[2], w9 +; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-FP16-NEXT: ret entry: %c = sitofp <3 x i8> %a to <3 x half> @@ -8084,7 +8404,18 @@ define <3 x half> @utofp_v3i8_v3f16(<3 x i8> %a) { ; CHECK-GI-NOFP16-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NOFP16-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NOFP16-NEXT: ucvtf v0.4s, v0.4s -; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: umov w8, v1.h[0] +; CHECK-GI-NOFP16-NEXT: umov w9, v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov v0.s[0], w8 +; CHECK-GI-NOFP16-NEXT: umov w8, v1.h[2] +; CHECK-GI-NOFP16-NEXT: mov v0.s[1], w9 +; CHECK-GI-NOFP16-NEXT: mov v0.s[2], w8 +; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1] +; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9 +; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: utofp_v3i8_v3f16: @@ -8093,7 +8424,18 @@ define <3 x half> @utofp_v3i8_v3f16(<3 x i8> %a) { ; CHECK-GI-FP16-NEXT: mov v0.b[1], w1 ; CHECK-GI-FP16-NEXT: mov v0.b[2], w2 ; CHECK-GI-FP16-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h +; CHECK-GI-FP16-NEXT: ucvtf v1.4h, v0.4h +; CHECK-GI-FP16-NEXT: umov w8, v1.h[0] +; CHECK-GI-FP16-NEXT: umov w9, v1.h[1] +; CHECK-GI-FP16-NEXT: mov v0.s[0], w8 +; CHECK-GI-FP16-NEXT: umov w8, v1.h[2] +; CHECK-GI-FP16-NEXT: mov v0.s[1], w9 +; CHECK-GI-FP16-NEXT: mov v0.s[2], w8 +; CHECK-GI-FP16-NEXT: mov w8, v0.s[1] +; CHECK-GI-FP16-NEXT: mov w9, v0.s[2] +; CHECK-GI-FP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-FP16-NEXT: mov v0.h[2], w9 +; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-FP16-NEXT: ret entry: %c = uitofp <3 x i8> %a to <3 x half> diff --git a/llvm/test/CodeGen/AArch64/llvm.exp10.ll b/llvm/test/CodeGen/AArch64/llvm.exp10.ll index c1ea891bc86e7..33e8a85784d13 100644 --- a/llvm/test/CodeGen/AArch64/llvm.exp10.ll +++ b/llvm/test/CodeGen/AArch64/llvm.exp10.ll @@ -109,11 +109,14 @@ define <2 x half> @exp10_v2f16(<2 x 
half> %x) { ; GISEL-NEXT: str q0, [sp] // 16-byte Folded Spill ; GISEL-NEXT: fmov s0, s1 ; GISEL-NEXT: bl exp10f -; GISEL-NEXT: fcvt h1, s0 -; GISEL-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; GISEL-NEXT: fcvt h0, s0 +; GISEL-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; GISEL-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload ; GISEL-NEXT: ldr d8, [sp, #16] // 8-byte Folded Reload -; GISEL-NEXT: mov v0.h[1], v1.h[0] +; GISEL-NEXT: mov v1.h[1], v0.h[0] +; GISEL-NEXT: mov h0, v1.h[1] +; GISEL-NEXT: mov v1.h[1], v0.h[0] +; GISEL-NEXT: mov v0.16b, v1.16b ; GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0 ; GISEL-NEXT: add sp, sp, #32 ; GISEL-NEXT: ret @@ -165,10 +168,9 @@ define <3 x half> @exp10_v3f16(<3 x half> %x) { ; ; GISEL-LABEL: exp10_v3f16: ; GISEL: // %bb.0: -; GISEL-NEXT: sub sp, sp, #64 -; GISEL-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill -; GISEL-NEXT: str x30, [sp, #48] // 8-byte Folded Spill -; GISEL-NEXT: .cfi_def_cfa_offset 64 +; GISEL-NEXT: stp d9, d8, [sp, #-32]! // 16-byte Folded Spill +; GISEL-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; GISEL-NEXT: .cfi_def_cfa_offset 32 ; GISEL-NEXT: .cfi_offset w30, -16 ; GISEL-NEXT: .cfi_offset b8, -24 ; GISEL-NEXT: .cfi_offset b9, -32 @@ -178,24 +180,27 @@ define <3 x half> @exp10_v3f16(<3 x half> %x) { ; GISEL-NEXT: fcvt s0, h0 ; GISEL-NEXT: bl exp10f ; GISEL-NEXT: fcvt s1, h8 -; GISEL-NEXT: fcvt h0, s0 -; GISEL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; GISEL-NEXT: fcvt h8, s0 ; GISEL-NEXT: fmov s0, s1 ; GISEL-NEXT: bl exp10f ; GISEL-NEXT: fcvt s1, h9 -; GISEL-NEXT: fcvt h0, s0 -; GISEL-NEXT: str q0, [sp] // 16-byte Folded Spill +; GISEL-NEXT: fcvt h9, s0 ; GISEL-NEXT: fmov s0, s1 ; GISEL-NEXT: bl exp10f -; GISEL-NEXT: ldp q2, q1, [sp] // 32-byte Folded Reload +; GISEL-NEXT: fmov w8, s8 ; GISEL-NEXT: fcvt h0, s0 -; GISEL-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload -; GISEL-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload -; GISEL-NEXT: mov v1.h[1], v2.h[0] -; GISEL-NEXT: mov v1.h[2], v0.h[0] -; GISEL-NEXT: mov v0.16b, v1.16b -; GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0 -; GISEL-NEXT: add sp, sp, #64 +; GISEL-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; GISEL-NEXT: mov v1.s[0], w8 +; GISEL-NEXT: fmov w8, s9 +; GISEL-NEXT: mov v1.s[1], w8 +; GISEL-NEXT: fmov w8, s0 +; GISEL-NEXT: mov v1.s[2], w8 +; GISEL-NEXT: mov w8, v1.s[1] +; GISEL-NEXT: mov w9, v1.s[2] +; GISEL-NEXT: mov v1.h[1], w8 +; GISEL-NEXT: mov v1.h[2], w9 +; GISEL-NEXT: fmov d0, d1 +; GISEL-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload ; GISEL-NEXT: ret %r = call <3 x half> @llvm.exp10.v3f16(<3 x half> %x) ret <3 x half> %r @@ -436,7 +441,9 @@ define <3 x float> @exp10_v3f32(<3 x float> %x) { ; GISEL-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload ; GISEL-NEXT: mov v1.s[1], v2.s[0] ; GISEL-NEXT: mov v1.s[2], v0.s[0] -; GISEL-NEXT: mov v0.16b, v1.16b +; GISEL-NEXT: mov v0.s[0], v1.s[0] +; GISEL-NEXT: mov v0.s[1], v1.s[1] +; GISEL-NEXT: mov v0.s[2], v1.s[2] ; GISEL-NEXT: add sp, sp, #64 ; GISEL-NEXT: ret %r = call <3 x float> @llvm.exp10.v3f32(<3 x float> %x) @@ -624,29 +631,33 @@ define <3 x double> @exp10_v3f64(<3 x double> %x) { ; ; GISEL-LABEL: exp10_v3f64: ; GISEL: // %bb.0: -; GISEL-NEXT: str d10, [sp, #-32]! 
// 8-byte Folded Spill -; GISEL-NEXT: stp d9, d8, [sp, #8] // 16-byte Folded Spill -; GISEL-NEXT: str x30, [sp, #24] // 8-byte Folded Spill -; GISEL-NEXT: .cfi_def_cfa_offset 32 -; GISEL-NEXT: .cfi_offset w30, -8 -; GISEL-NEXT: .cfi_offset b8, -16 -; GISEL-NEXT: .cfi_offset b9, -24 -; GISEL-NEXT: .cfi_offset b10, -32 +; GISEL-NEXT: sub sp, sp, #64 +; GISEL-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill +; GISEL-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; GISEL-NEXT: .cfi_def_cfa_offset 64 +; GISEL-NEXT: .cfi_offset w30, -16 +; GISEL-NEXT: .cfi_offset b8, -24 +; GISEL-NEXT: .cfi_offset b9, -32 ; GISEL-NEXT: fmov d8, d1 ; GISEL-NEXT: fmov d9, d2 ; GISEL-NEXT: bl exp10 -; GISEL-NEXT: fmov d10, d0 +; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 +; GISEL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; GISEL-NEXT: fmov d0, d8 ; GISEL-NEXT: bl exp10 -; GISEL-NEXT: fmov d8, d0 +; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 +; GISEL-NEXT: str q0, [sp] // 16-byte Folded Spill ; GISEL-NEXT: fmov d0, d9 ; GISEL-NEXT: bl exp10 -; GISEL-NEXT: fmov d1, d8 -; GISEL-NEXT: ldp d9, d8, [sp, #8] // 16-byte Folded Reload -; GISEL-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload +; GISEL-NEXT: ldp q1, q3, [sp] // 32-byte Folded Reload ; GISEL-NEXT: fmov d2, d0 -; GISEL-NEXT: fmov d0, d10 -; GISEL-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload +; GISEL-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; GISEL-NEXT: // kill: def $d2 killed $d2 killed $q2 +; GISEL-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; GISEL-NEXT: mov v3.d[1], v1.d[0] +; GISEL-NEXT: mov d1, v3.d[1] +; GISEL-NEXT: fmov d0, d3 +; GISEL-NEXT: add sp, sp, #64 ; GISEL-NEXT: ret %r = call <3 x double> @llvm.exp10.v3f64(<3 x double> %x) ret <3 x double> %r diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll index 70ab10e716875..a93a089cda3be 100644 --- a/llvm/test/CodeGen/AArch64/load.ll +++ b/llvm/test/CodeGen/AArch64/load.ll @@ -215,10 +215,16 @@ define <3 x i8> @load_v3i8(ptr %ptr){ ; ; CHECK-GI-LABEL: load_v3i8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldrb w8, [x0] -; CHECK-GI-NEXT: ldrb w1, [x0, #1] -; CHECK-GI-NEXT: ldrb w2, [x0, #2] -; CHECK-GI-NEXT: mov w0, w8 +; CHECK-GI-NEXT: ldr b0, [x0] +; CHECK-GI-NEXT: ldr b1, [x0, #1] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] +; CHECK-GI-NEXT: ldr b1, [x0, #2] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: mov s2, v0.s[2] +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: fmov w1, s1 +; CHECK-GI-NEXT: fmov w2, s2 ; CHECK-GI-NEXT: ret %a = load <3 x i8>, ptr %ptr ret <3 x i8> %a @@ -261,10 +267,14 @@ define <3 x i16> @load_v3i16(ptr %ptr){ ; CHECK-GI-LABEL: load_v3i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] -; CHECK-GI-NEXT: add x8, x0, #4 -; CHECK-GI-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] +; CHECK-GI-NEXT: ldr h1, [x0, #4] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %a = load <3 x i16>, ptr %ptr @@ -305,11 +315,14 @@ define <3 x i32> @load_v3i32(ptr %ptr){ ; ; CHECK-GI-LABEL: load_v3i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldr s0, [x0] +; CHECK-GI-NEXT: ldr s1, [x0] ; CHECK-GI-NEXT: add x8, x0, #4 -; CHECK-GI-NEXT: ld1 { v0.s }[1], [x8] +; 
CHECK-GI-NEXT: ld1 { v1.s }[1], [x8] ; CHECK-GI-NEXT: add x8, x0, #8 -; CHECK-GI-NEXT: ld1 { v0.s }[2], [x8] +; CHECK-GI-NEXT: ld1 { v1.s }[2], [x8] +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: ret %a = load <3 x i32>, ptr %ptr ret <3 x i32> %a diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll index 9ca975d9e742e..9735354402aab 100644 --- a/llvm/test/CodeGen/AArch64/mul.ll +++ b/llvm/test/CodeGen/AArch64/mul.ll @@ -355,10 +355,24 @@ entry: } define <3 x i32> @v3i32(<3 x i32> %d, <3 x i32> %e) { -; CHECK-LABEL: v3i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: v3i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v3i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v3.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v3.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v3.s[2], v1.s[2] +; CHECK-GI-NEXT: mul v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %s = mul <3 x i32> %d, %e ret <3 x i32> %s @@ -457,14 +471,15 @@ define <3 x i64> @v3i64(<3 x i64> %d, <3 x i64> %e) { ; CHECK-GI-NEXT: mov x11, v3.d[1] ; CHECK-GI-NEXT: mul x8, x8, x9 ; CHECK-GI-NEXT: mul x9, x10, x11 +; CHECK-GI-NEXT: fmov x10, d5 ; CHECK-GI-NEXT: mov v0.d[0], x8 ; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: mul x8, x8, x10 ; CHECK-GI-NEXT: mov v0.d[1], x9 -; CHECK-GI-NEXT: fmov x9, d5 -; CHECK-GI-NEXT: mul x8, x8, x9 ; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: mov v2.d[0], x8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: ret entry: %s = mul <3 x i64> %d, %e diff --git a/llvm/test/CodeGen/AArch64/rem.ll b/llvm/test/CodeGen/AArch64/rem.ll index d807635f5d87d..ad83cc8172072 100644 --- a/llvm/test/CodeGen/AArch64/rem.ll +++ b/llvm/test/CodeGen/AArch64/rem.ll @@ -227,10 +227,18 @@ define <3 x i8> @sv3i8(<3 x i8> %d, <3 x i8> %e) { ; CHECK-GI-NEXT: sxtb w15, w5 ; CHECK-GI-NEXT: sdiv w10, w8, w9 ; CHECK-GI-NEXT: sdiv w13, w11, w12 -; CHECK-GI-NEXT: msub w0, w10, w9, w8 -; CHECK-GI-NEXT: sdiv w16, w14, w15 -; CHECK-GI-NEXT: msub w1, w13, w12, w11 -; CHECK-GI-NEXT: msub w2, w16, w15, w14 +; CHECK-GI-NEXT: msub w8, w10, w9, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: sdiv w9, w14, w15 +; CHECK-GI-NEXT: msub w8, w13, w12, w11 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: msub w8, w9, w15, w14 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: mov s2, v0.s[2] +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: fmov w1, s1 +; CHECK-GI-NEXT: fmov w2, s2 ; CHECK-GI-NEXT: ret entry: %s = srem <3 x i8> %d, %e @@ -1141,15 +1149,23 @@ define <3 x i8> @uv3i8(<3 x i8> %d, <3 x i8> %e) { ; CHECK-GI-NEXT: and w8, w0, #0xff ; CHECK-GI-NEXT: and w9, w3, #0xff ; CHECK-GI-NEXT: and w11, w1, #0xff +; CHECK-GI-NEXT: udiv w10, w8, w9 ; CHECK-GI-NEXT: and w12, w4, #0xff ; CHECK-GI-NEXT: and w14, w2, #0xff ; CHECK-GI-NEXT: and w15, w5, #0xff -; CHECK-GI-NEXT: udiv w10, w8, w9 ; CHECK-GI-NEXT: udiv w13, w11, w12 -; CHECK-GI-NEXT: msub w0, w10, w9, w8 -; CHECK-GI-NEXT: udiv w16, w14, w15 -; CHECK-GI-NEXT: msub w1, w13, w12, 
w11 -; CHECK-GI-NEXT: msub w2, w16, w15, w14 +; CHECK-GI-NEXT: msub w8, w10, w9, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: udiv w9, w14, w15 +; CHECK-GI-NEXT: msub w8, w13, w12, w11 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: msub w8, w9, w15, w14 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: mov s2, v0.s[2] +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: fmov w1, s1 +; CHECK-GI-NEXT: fmov w2, s2 ; CHECK-GI-NEXT: ret entry: %s = urem <3 x i8> %d, %e @@ -2075,12 +2091,16 @@ define <3 x i16> @sv3i16(<3 x i16> %d, <3 x i16> %e) { ; CHECK-GI-NEXT: sdiv w10, w8, w9 ; CHECK-GI-NEXT: sdiv w13, w11, w12 ; CHECK-GI-NEXT: msub w8, w10, w9, w8 -; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: sdiv w16, w14, w15 -; CHECK-GI-NEXT: msub w9, w13, w12, w11 -; CHECK-GI-NEXT: mov v0.h[1], w9 -; CHECK-GI-NEXT: msub w8, w16, w15, w14 -; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: sdiv w9, w14, w15 +; CHECK-GI-NEXT: msub w8, w13, w12, w11 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: msub w8, w9, w15, w14 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -2543,12 +2563,16 @@ define <3 x i16> @uv3i16(<3 x i16> %d, <3 x i16> %e) { ; CHECK-GI-NEXT: udiv w10, w8, w9 ; CHECK-GI-NEXT: udiv w13, w11, w12 ; CHECK-GI-NEXT: msub w8, w10, w9, w8 -; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: udiv w16, w14, w15 -; CHECK-GI-NEXT: msub w9, w13, w12, w11 -; CHECK-GI-NEXT: mov v0.h[1], w9 -; CHECK-GI-NEXT: msub w8, w16, w15, w14 -; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: udiv w9, w14, w15 +; CHECK-GI-NEXT: msub w8, w13, w12, w11 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: msub w8, w9, w15, w14 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -3003,12 +3027,15 @@ define <3 x i32> @sv3i32(<3 x i32> %d, <3 x i32> %e) { ; CHECK-GI-NEXT: fmov w15, s1 ; CHECK-GI-NEXT: sdiv w13, w11, w12 ; CHECK-GI-NEXT: msub w8, w10, w9, w8 -; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: mov v1.s[0], w8 ; CHECK-GI-NEXT: sdiv w9, w14, w15 ; CHECK-GI-NEXT: msub w8, w13, w12, w11 -; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: mov v1.s[1], w8 ; CHECK-GI-NEXT: msub w8, w9, w15, w14 -; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: ret entry: %s = srem <3 x i32> %d, %e @@ -3234,12 +3261,15 @@ define <3 x i32> @uv3i32(<3 x i32> %d, <3 x i32> %e) { ; CHECK-GI-NEXT: fmov w15, s1 ; CHECK-GI-NEXT: udiv w13, w11, w12 ; CHECK-GI-NEXT: msub w8, w10, w9, w8 -; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: mov v1.s[0], w8 ; CHECK-GI-NEXT: udiv w9, w14, w15 ; CHECK-GI-NEXT: msub w8, w13, w12, w11 -; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: mov v1.s[1], w8 ; CHECK-GI-NEXT: msub w8, w9, w15, w14 -; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: ret entry: %s = urem <3 x i32> %d, %e @@ 
-3469,25 +3499,26 @@ define <3 x i64> @sv3i64(<3 x i64> %d, <3 x i64> %e) { ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: sdiv x8, x8, x9 ; CHECK-GI-NEXT: fmov x9, d1 -; CHECK-GI-NEXT: fmov x11, d3 +; CHECK-GI-NEXT: fmov x12, d3 ; CHECK-GI-NEXT: mov x14, v3.d[1] ; CHECK-GI-NEXT: sdiv x9, x9, x10 ; CHECK-GI-NEXT: mov v6.d[0], x8 ; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: fmov x10, d5 ; CHECK-GI-NEXT: mov v6.d[1], x9 -; CHECK-GI-NEXT: fmov x9, d5 -; CHECK-GI-NEXT: sdiv x12, x8, x9 -; CHECK-GI-NEXT: fmov x10, d6 +; CHECK-GI-NEXT: sdiv x9, x8, x10 +; CHECK-GI-NEXT: fmov x11, d6 ; CHECK-GI-NEXT: mov x13, v6.d[1] -; CHECK-GI-NEXT: mul x10, x10, x11 -; CHECK-GI-NEXT: mul x11, x13, x14 -; CHECK-GI-NEXT: mov v2.d[0], x10 -; CHECK-GI-NEXT: mov v2.d[1], x11 -; CHECK-GI-NEXT: msub x8, x12, x9, x8 +; CHECK-GI-NEXT: mul x11, x11, x12 +; CHECK-GI-NEXT: mul x12, x13, x14 +; CHECK-GI-NEXT: mov v2.d[0], x11 +; CHECK-GI-NEXT: mov v2.d[1], x12 +; CHECK-GI-NEXT: msub x8, x9, x10, x8 ; CHECK-GI-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: mov v2.d[0], x8 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NEXT: fmov d2, x8 ; CHECK-GI-NEXT: ret entry: %s = srem <3 x i64> %d, %e @@ -3634,25 +3665,26 @@ define <3 x i64> @uv3i64(<3 x i64> %d, <3 x i64> %e) { ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: udiv x8, x8, x9 ; CHECK-GI-NEXT: fmov x9, d1 -; CHECK-GI-NEXT: fmov x11, d3 +; CHECK-GI-NEXT: fmov x12, d3 ; CHECK-GI-NEXT: mov x14, v3.d[1] ; CHECK-GI-NEXT: udiv x9, x9, x10 ; CHECK-GI-NEXT: mov v6.d[0], x8 ; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: fmov x10, d5 ; CHECK-GI-NEXT: mov v6.d[1], x9 -; CHECK-GI-NEXT: fmov x9, d5 -; CHECK-GI-NEXT: udiv x12, x8, x9 -; CHECK-GI-NEXT: fmov x10, d6 +; CHECK-GI-NEXT: udiv x9, x8, x10 +; CHECK-GI-NEXT: fmov x11, d6 ; CHECK-GI-NEXT: mov x13, v6.d[1] -; CHECK-GI-NEXT: mul x10, x10, x11 -; CHECK-GI-NEXT: mul x11, x13, x14 -; CHECK-GI-NEXT: mov v2.d[0], x10 -; CHECK-GI-NEXT: mov v2.d[1], x11 -; CHECK-GI-NEXT: msub x8, x12, x9, x8 +; CHECK-GI-NEXT: mul x11, x11, x12 +; CHECK-GI-NEXT: mul x12, x13, x14 +; CHECK-GI-NEXT: mov v2.d[0], x11 +; CHECK-GI-NEXT: mov v2.d[1], x12 +; CHECK-GI-NEXT: msub x8, x9, x10, x8 ; CHECK-GI-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: mov v2.d[0], x8 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NEXT: fmov d2, x8 ; CHECK-GI-NEXT: ret entry: %s = urem <3 x i64> %d, %e diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll index 853ed92c91fbc..ca38f3b701084 100644 --- a/llvm/test/CodeGen/AArch64/sext.ll +++ b/llvm/test/CodeGen/AArch64/sext.ll @@ -219,18 +219,16 @@ define <3 x i16> @sext_v3i8_v3i16(<3 x i8> %a) { ; ; CHECK-GI-LABEL: sext_v3i8_v3i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsl w8, w0, #8 -; CHECK-GI-NEXT: lsl w9, w1, #8 -; CHECK-GI-NEXT: lsl w10, w2, #8 -; CHECK-GI-NEXT: sxth w8, w8 -; CHECK-GI-NEXT: sxth w9, w9 -; CHECK-GI-NEXT: asr w8, w8, #8 -; CHECK-GI-NEXT: asr w9, w9, #8 -; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: sxth w8, w10 -; CHECK-GI-NEXT: asr w8, w8, #8 -; CHECK-GI-NEXT: mov v0.h[1], w9 -; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: sxtb w8, w0 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: sxtb w8, w1 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: sxtb w8, w2 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov w8, v0.s[1] 
+; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -252,11 +250,14 @@ define <3 x i32> @sext_v3i8_v3i32(<3 x i8> %a) { ; CHECK-GI-LABEL: sext_v3i8_v3i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: sxtb w8, w0 -; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: mov v1.s[0], w8 ; CHECK-GI-NEXT: sxtb w8, w1 -; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: mov v1.s[1], w8 ; CHECK-GI-NEXT: sxtb w8, w2 -; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: ret entry: %c = sext <3 x i8> %a to <3 x i32> @@ -284,14 +285,17 @@ define <3 x i64> @sext_v3i8_v3i64(<3 x i8> %a) { ; CHECK-GI-LABEL: sext_v3i8_v3i64: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-GI-NEXT: sxtb x8, w0 ; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2 -; CHECK-GI-NEXT: sxtb x8, w0 -; CHECK-GI-NEXT: sxtb x9, w1 -; CHECK-GI-NEXT: sxtb x10, w2 -; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: fmov d1, x9 -; CHECK-GI-NEXT: fmov d2, x10 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: sxtb x8, w1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: sxtb x8, w2 +; CHECK-GI-NEXT: mov v2.d[0], x8 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: %c = sext <3 x i8> %a to <3 x i64> @@ -313,7 +317,9 @@ define <3 x i32> @sext_v3i16_v3i32(<3 x i16> %a) { ; CHECK-GI-NEXT: smov w8, v0.h[2] ; CHECK-GI-NEXT: mov v1.s[1], w9 ; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: ret entry: %c = sext <3 x i16> %a to <3 x i32> @@ -337,10 +343,13 @@ define <3 x i64> @sext_v3i16_v3i64(<3 x i16> %a) { ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: smov x8, v0.h[0] ; CHECK-GI-NEXT: smov x9, v0.h[1] -; CHECK-GI-NEXT: smov x10, v0.h[2] -; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: fmov d1, x9 -; CHECK-GI-NEXT: fmov d2, x10 +; CHECK-GI-NEXT: mov v3.d[0], x8 +; CHECK-GI-NEXT: smov x8, v0.h[2] +; CHECK-GI-NEXT: mov v3.d[1], x9 +; CHECK-GI-NEXT: mov v2.d[0], x8 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: mov d1, v3.d[1] +; CHECK-GI-NEXT: fmov d0, d3 ; CHECK-GI-NEXT: ret entry: %c = sext <3 x i16> %a to <3 x i64> @@ -362,10 +371,13 @@ define <3 x i64> @sext_v3i32_v3i64(<3 x i32> %a) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: smov x8, v0.s[0] ; CHECK-GI-NEXT: smov x9, v0.s[1] -; CHECK-GI-NEXT: smov x10, v0.s[2] -; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: fmov d1, x9 -; CHECK-GI-NEXT: fmov d2, x10 +; CHECK-GI-NEXT: mov v3.d[0], x8 +; CHECK-GI-NEXT: smov x8, v0.s[2] +; CHECK-GI-NEXT: mov v3.d[1], x9 +; CHECK-GI-NEXT: mov v2.d[0], x8 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: mov d1, v3.d[1] +; CHECK-GI-NEXT: fmov d0, d3 ; CHECK-GI-NEXT: ret entry: %c = sext <3 x i32> %a to <3 x i64> @@ -384,18 +396,16 @@ define <3 x i16> @sext_v3i10_v3i16(<3 x i10> %a) { ; ; CHECK-GI-LABEL: sext_v3i10_v3i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsl w8, w0, #6 -; CHECK-GI-NEXT: lsl w9, w1, #6 -; 
CHECK-GI-NEXT: lsl w10, w2, #6 -; CHECK-GI-NEXT: sxth w8, w8 -; CHECK-GI-NEXT: sxth w9, w9 -; CHECK-GI-NEXT: asr w8, w8, #6 -; CHECK-GI-NEXT: asr w9, w9, #6 -; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: sxth w8, w10 -; CHECK-GI-NEXT: asr w8, w8, #6 -; CHECK-GI-NEXT: mov v0.h[1], w9 -; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: sbfx w8, w0, #0, #10 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: sbfx w8, w1, #0, #10 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: sbfx w8, w2, #0, #10 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -417,11 +427,14 @@ define <3 x i32> @sext_v3i10_v3i32(<3 x i10> %a) { ; CHECK-GI-LABEL: sext_v3i10_v3i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: sbfx w8, w0, #0, #10 -; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: mov v1.s[0], w8 ; CHECK-GI-NEXT: sbfx w8, w1, #0, #10 -; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: mov v1.s[1], w8 ; CHECK-GI-NEXT: sbfx w8, w2, #0, #10 -; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: ret entry: %c = sext <3 x i10> %a to <3 x i32> @@ -449,14 +462,17 @@ define <3 x i64> @sext_v3i10_v3i64(<3 x i10> %a) { ; CHECK-GI-LABEL: sext_v3i10_v3i64: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-GI-NEXT: sbfx x8, x0, #0, #10 ; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2 -; CHECK-GI-NEXT: sbfx x8, x0, #0, #10 -; CHECK-GI-NEXT: sbfx x9, x1, #0, #10 -; CHECK-GI-NEXT: sbfx x10, x2, #0, #10 -; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: fmov d1, x9 -; CHECK-GI-NEXT: fmov d2, x10 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: sbfx x8, x1, #0, #10 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: sbfx x8, x2, #0, #10 +; CHECK-GI-NEXT: mov v2.d[0], x8 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: %c = sext <3 x i10> %a to <3 x i64> diff --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll index 066928687cc02..a9517383cae0d 100644 --- a/llvm/test/CodeGen/AArch64/shift.ll +++ b/llvm/test/CodeGen/AArch64/shift.ll @@ -1069,9 +1069,17 @@ define <3 x i8> @shl_v3i8(<3 x i8> %0, <3 x i8> %1){ ; CHECK-GI-NEXT: mov v0.b[2], w2 ; CHECK-GI-NEXT: mov v1.b[2], w5 ; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v1.8b -; CHECK-GI-NEXT: umov w0, v0.b[0] -; CHECK-GI-NEXT: umov w1, v0.b[1] -; CHECK-GI-NEXT: umov w2, v0.b[2] +; CHECK-GI-NEXT: umov w8, v0.b[0] +; CHECK-GI-NEXT: umov w9, v0.b[1] +; CHECK-GI-NEXT: mov v1.s[0], w8 +; CHECK-GI-NEXT: umov w8, v0.b[2] +; CHECK-GI-NEXT: mov v1.s[1], w9 +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: mov s0, v1.s[1] +; CHECK-GI-NEXT: mov s2, v1.s[2] +; CHECK-GI-NEXT: fmov w0, s1 +; CHECK-GI-NEXT: fmov w1, s0 +; CHECK-GI-NEXT: fmov w2, s2 ; CHECK-GI-NEXT: ret %3 = shl <3 x i8> %0, %1 ret <3 x i8> %3 @@ -1087,10 +1095,34 @@ define <7 x i8> @shl_v7i8(<7 x i8> %0, <7 x i8> %1){ } define <3 x i16> @shl_v3i16(<3 x i16> %0, <3 x i16> %1){ -; CHECK-LABEL: shl_v3i16: -; CHECK: // %bb.0: -; CHECK-NEXT: ushl v0.4h, v0.4h, v1.4h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shl_v3i16: +; CHECK-SD: // 
%bb.0: +; CHECK-SD-NEXT: ushl v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shl_v3i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov v2.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v3.h[0], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v3.h[1], v1.h[1] +; CHECK-GI-NEXT: mov v2.h[2], v0.h[2] +; CHECK-GI-NEXT: mov v3.h[2], v1.h[2] +; CHECK-GI-NEXT: ushl v1.4h, v2.4h, v3.4h +; CHECK-GI-NEXT: umov w8, v1.h[0] +; CHECK-GI-NEXT: umov w9, v1.h[1] +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: umov w8, v1.h[2] +; CHECK-GI-NEXT: mov v0.s[1], w9 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret %3 = shl <3 x i16> %0, %1 ret <3 x i16> %3 } @@ -1105,10 +1137,24 @@ define <7 x i16> @shl_v7i16(<7 x i16> %0, <7 x i16> %1){ } define <3 x i32> @shl_v3i32(<3 x i32> %0, <3 x i32> %1){ -; CHECK-LABEL: shl_v3i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ushl v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shl_v3i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ushl v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shl_v3i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v3.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v3.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v3.s[2], v1.s[2] +; CHECK-GI-NEXT: ushl v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret %3 = shl <3 x i32> %0, %1 ret <3 x i32> %3 } @@ -1142,9 +1188,17 @@ define <3 x i8> @ashr_v3i8(<3 x i8> %0, <3 x i8> %1){ ; CHECK-GI-NEXT: mov v1.b[2], w2 ; CHECK-GI-NEXT: neg v0.8b, v0.8b ; CHECK-GI-NEXT: sshl v0.8b, v1.8b, v0.8b -; CHECK-GI-NEXT: umov w0, v0.b[0] -; CHECK-GI-NEXT: umov w1, v0.b[1] -; CHECK-GI-NEXT: umov w2, v0.b[2] +; CHECK-GI-NEXT: umov w8, v0.b[0] +; CHECK-GI-NEXT: umov w9, v0.b[1] +; CHECK-GI-NEXT: mov v1.s[0], w8 +; CHECK-GI-NEXT: umov w8, v0.b[2] +; CHECK-GI-NEXT: mov v1.s[1], w9 +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: mov s0, v1.s[1] +; CHECK-GI-NEXT: mov s2, v1.s[2] +; CHECK-GI-NEXT: fmov w0, s1 +; CHECK-GI-NEXT: fmov w1, s0 +; CHECK-GI-NEXT: fmov w2, s2 ; CHECK-GI-NEXT: ret %3 = ashr <3 x i8> %0, %1 ret <3 x i8> %3 @@ -1161,11 +1215,36 @@ define <7 x i8> @ashr_v7i8(<7 x i8> %0, <7 x i8> %1){ } define <3 x i16> @ashr_v3i16(<3 x i16> %0, <3 x i16> %1){ -; CHECK-LABEL: ashr_v3i16: -; CHECK: // %bb.0: -; CHECK-NEXT: neg v1.4h, v1.4h -; CHECK-NEXT: sshl v0.4h, v0.4h, v1.4h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ashr_v3i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: neg v1.4h, v1.4h +; CHECK-SD-NEXT: sshl v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ashr_v3i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov v2.h[0], v1.h[0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov v3.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v2.h[1], v1.h[1] +; CHECK-GI-NEXT: mov v3.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v2.h[2], v1.h[2] +; CHECK-GI-NEXT: mov v3.h[2], v0.h[2] +; CHECK-GI-NEXT: neg v0.4h, v2.4h +; CHECK-GI-NEXT: sshl v1.4h, v3.4h, v0.4h +; CHECK-GI-NEXT: umov w8, v1.h[0] +; CHECK-GI-NEXT: umov w9, 
v1.h[1] +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: umov w8, v1.h[2] +; CHECK-GI-NEXT: mov v0.s[1], w9 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret %3 = ashr <3 x i16> %0, %1 ret <3 x i16> %3 } @@ -1181,11 +1260,26 @@ define <7 x i16> @ashr_v7i16(<7 x i16> %0, <7 x i16> %1){ } define <3 x i32> @ashr_v3i32(<3 x i32> %0, <3 x i32> %1){ -; CHECK-LABEL: ashr_v3i32: -; CHECK: // %bb.0: -; CHECK-NEXT: neg v1.4s, v1.4s -; CHECK-NEXT: sshl v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ashr_v3i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: neg v1.4s, v1.4s +; CHECK-SD-NEXT: sshl v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ashr_v3i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov v2.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v3.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v3.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v1.s[2] +; CHECK-GI-NEXT: mov v3.s[2], v0.s[2] +; CHECK-GI-NEXT: neg v0.4s, v2.4s +; CHECK-GI-NEXT: sshl v1.4s, v3.4s, v0.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret %3 = ashr <3 x i32> %0, %1 ret <3 x i32> %3 } @@ -1218,9 +1312,17 @@ define <3 x i8> @lshr_v3i8(<3 x i8> %0, <3 x i8> %1){ ; CHECK-GI-NEXT: mov v1.b[2], w2 ; CHECK-GI-NEXT: neg v0.8b, v0.8b ; CHECK-GI-NEXT: ushl v0.8b, v1.8b, v0.8b -; CHECK-GI-NEXT: umov w0, v0.b[0] -; CHECK-GI-NEXT: umov w1, v0.b[1] -; CHECK-GI-NEXT: umov w2, v0.b[2] +; CHECK-GI-NEXT: umov w8, v0.b[0] +; CHECK-GI-NEXT: umov w9, v0.b[1] +; CHECK-GI-NEXT: mov v1.s[0], w8 +; CHECK-GI-NEXT: umov w8, v0.b[2] +; CHECK-GI-NEXT: mov v1.s[1], w9 +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: mov s0, v1.s[1] +; CHECK-GI-NEXT: mov s2, v1.s[2] +; CHECK-GI-NEXT: fmov w0, s1 +; CHECK-GI-NEXT: fmov w1, s0 +; CHECK-GI-NEXT: fmov w2, s2 ; CHECK-GI-NEXT: ret %3 = lshr <3 x i8> %0, %1 ret <3 x i8> %3 @@ -1237,11 +1339,36 @@ define <7 x i8> @lshr_v7i8(<7 x i8> %0, <7 x i8> %1){ } define <3 x i16> @lshr_v3i16(<3 x i16> %0, <3 x i16> %1){ -; CHECK-LABEL: lshr_v3i16: -; CHECK: // %bb.0: -; CHECK-NEXT: neg v1.4h, v1.4h -; CHECK-NEXT: ushl v0.4h, v0.4h, v1.4h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: lshr_v3i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: neg v1.4h, v1.4h +; CHECK-SD-NEXT: ushl v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: lshr_v3i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov v2.h[0], v1.h[0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov v3.h[0], v0.h[0] +; CHECK-GI-NEXT: mov v2.h[1], v1.h[1] +; CHECK-GI-NEXT: mov v3.h[1], v0.h[1] +; CHECK-GI-NEXT: mov v2.h[2], v1.h[2] +; CHECK-GI-NEXT: mov v3.h[2], v0.h[2] +; CHECK-GI-NEXT: neg v0.4h, v2.4h +; CHECK-GI-NEXT: ushl v1.4h, v3.4h, v0.4h +; CHECK-GI-NEXT: umov w8, v1.h[0] +; CHECK-GI-NEXT: umov w9, v1.h[1] +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: umov w8, v1.h[2] +; CHECK-GI-NEXT: mov v0.s[1], w9 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret %3 = lshr <3 x i16> %0, %1 ret <3 x i16> %3 } @@ -1257,11 +1384,26 @@ define <7 x i16> @lshr_v7i16(<7 x i16> 
%0, <7 x i16> %1){ } define <3 x i32> @lshr_v3i32(<3 x i32> %0, <3 x i32> %1){ -; CHECK-LABEL: lshr_v3i32: -; CHECK: // %bb.0: -; CHECK-NEXT: neg v1.4s, v1.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: lshr_v3i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: neg v1.4s, v1.4s +; CHECK-SD-NEXT: ushl v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: lshr_v3i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov v2.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v3.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v3.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v1.s[2] +; CHECK-GI-NEXT: mov v3.s[2], v0.s[2] +; CHECK-GI-NEXT: neg v0.4s, v2.4s +; CHECK-GI-NEXT: ushl v1.4s, v3.4s, v0.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret %3 = lshr <3 x i32> %0, %1 ret <3 x i32> %3 } diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll index 6b5951551c3a5..02142f9b9e71d 100644 --- a/llvm/test/CodeGen/AArch64/shufflevector.ll +++ b/llvm/test/CodeGen/AArch64/shufflevector.ll @@ -322,10 +322,17 @@ define <16 x i16> @shufflevector_v16i16(<16 x i16> %a, <16 x i16> %b){ } define <1 x i32> @shufflevector_v1i32(<1 x i32> %a, <1 x i32> %b) { -; CHECK-LABEL: shufflevector_v1i32: -; CHECK: // %bb.0: -; CHECK-NEXT: fmov d0, d1 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shufflevector_v1i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov d0, d1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shufflevector_v1i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret %c = shufflevector <1 x i32> %a, <1 x i32> %b, <1 x i32> ret <1 x i32> %c } @@ -464,9 +471,16 @@ define <16 x i16> @shufflevector_v16i16_zeroes(<16 x i16> %a, <16 x i16> %b){ } define <1 x i32> @shufflevector_v1i32_zeroes(<1 x i32> %a, <1 x i32> %b) { -; CHECK-LABEL: shufflevector_v1i32_zeroes: -; CHECK: // %bb.0: -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shufflevector_v1i32_zeroes: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shufflevector_v1i32_zeroes: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov v0.s[0], v0.s[0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret %c = shufflevector <1 x i32> %a, <1 x i32> %b, <1 x i32> ret <1 x i32> %c } @@ -503,19 +517,14 @@ define <3 x i8> @shufflevector_v3i8(<3 x i8> %a, <3 x i8> %b) { ; ; CHECK-GI-LABEL: shufflevector_v3i8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w3 -; CHECK-GI-NEXT: adrp x8, .LCPI30_0 -; CHECK-GI-NEXT: mov v0.b[1], w1 -; CHECK-GI-NEXT: mov v1.b[1], w4 -; CHECK-GI-NEXT: mov v0.b[2], w2 -; CHECK-GI-NEXT: mov v1.b[2], w5 -; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI30_0] -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b -; CHECK-GI-NEXT: umov w0, v0.b[0] -; CHECK-GI-NEXT: umov w1, v0.b[1] -; CHECK-GI-NEXT: umov w2, v0.b[2] +; CHECK-GI-NEXT: mov v0.s[0], w0 +; CHECK-GI-NEXT: mov v0.s[1], w1 +; CHECK-GI-NEXT: mov v0.s[2], w2 +; CHECK-GI-NEXT: mov w2, w4 +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: mov s0, v0.s[2] +; CHECK-GI-NEXT: fmov w0, s1 +; CHECK-GI-NEXT: fmov w1, s0 ; CHECK-GI-NEXT: ret %c = shufflevector <3 x i8> %a, <3 x i8> %b, <3 x i32> ret <3 x i8> %c @@ -556,11 +565,18 @@ define <3 x i16> 
@shufflevector_v3i16(<3 x i16> %a, <3 x i16> %b) { ; CHECK-GI-LABEL: shufflevector_v3i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: umov w8, v0.h[0] +; CHECK-GI-NEXT: umov w9, v0.h[1] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: adrp x8, .LCPI32_0 -; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI32_0] -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b +; CHECK-GI-NEXT: mov v2.s[0], w8 +; CHECK-GI-NEXT: umov w8, v0.h[2] +; CHECK-GI-NEXT: mov v2.s[1], w9 +; CHECK-GI-NEXT: mov v2.s[2], w8 +; CHECK-GI-NEXT: mov w8, v2.s[1] +; CHECK-GI-NEXT: mov w9, v2.s[2] +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.h[1], w9 +; CHECK-GI-NEXT: mov v0.h[2], v1.h[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %c = shufflevector <3 x i16> %a, <3 x i16> %b, <3 x i32> @@ -598,11 +614,12 @@ define <3 x i32> @shufflevector_v3i32(<3 x i32> %a, <3 x i32> %b) { ; ; CHECK-GI-LABEL: shufflevector_v3i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: adrp x8, .LCPI34_0 -; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI34_0] -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v0.s[0], v2.s[1] +; CHECK-GI-NEXT: mov v0.s[1], v2.s[2] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[1] ; CHECK-GI-NEXT: ret %c = shufflevector <3 x i32> %a, <3 x i32> %b, <3 x i32> ret <3 x i32> %c @@ -619,13 +636,12 @@ define <3 x i8> @shufflevector_v3i8_zeroes(<3 x i8> %a, <3 x i8> %b) { ; ; CHECK-GI-LABEL: shufflevector_v3i8_zeroes: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: mov v0.b[1], w1 -; CHECK-GI-NEXT: mov v0.b[2], w2 -; CHECK-GI-NEXT: dup v0.8b, v0.b[0] -; CHECK-GI-NEXT: umov w0, v0.b[0] -; CHECK-GI-NEXT: umov w1, v0.b[1] -; CHECK-GI-NEXT: umov w2, v0.b[2] +; CHECK-GI-NEXT: mov v0.s[0], w0 +; CHECK-GI-NEXT: mov v0.s[1], w1 +; CHECK-GI-NEXT: mov v0.s[2], w2 +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: fmov w1, s0 +; CHECK-GI-NEXT: fmov w2, s0 ; CHECK-GI-NEXT: ret %c = shufflevector <3 x i8> %a, <3 x i8> %b, <3 x i32> ret <3 x i8> %c @@ -642,11 +658,26 @@ define <7 x i8> @shufflevector_v7i8_zeroes(<7 x i8> %a, <7 x i8> %b) { } define <3 x i16> @shufflevector_v3i16_zeroes(<3 x i16> %a, <3 x i16> %b) { -; CHECK-LABEL: shufflevector_v3i16_zeroes: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: dup v0.4h, v0.h[0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shufflevector_v3i16_zeroes: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: dup v0.4h, v0.h[0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shufflevector_v3i16_zeroes: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: umov w8, v0.h[0] +; CHECK-GI-NEXT: umov w9, v0.h[1] +; CHECK-GI-NEXT: mov v1.s[0], w8 +; CHECK-GI-NEXT: umov w8, v0.h[2] +; CHECK-GI-NEXT: mov v1.s[1], w9 +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret %c = shufflevector <3 x i16> %a, <3 x i16> %b, <3 x i32> ret <3 x i16> %c } @@ -661,10 +692,20 @@ define <7 x i16> @shufflevector_v7i16_zeroes(<7 x 
i16> %a, <7 x i16> %b) { } define <3 x i32> @shufflevector_v3i32_zeroes(<3 x i32> %a, <3 x i32> %b) { -; CHECK-LABEL: shufflevector_v3i32_zeroes: -; CHECK: // %bb.0: -; CHECK-NEXT: dup v0.4s, v0.s[0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shufflevector_v3i32_zeroes: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: dup v0.4s, v0.s[0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shufflevector_v3i32_zeroes: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] +; CHECK-GI-NEXT: ret %c = shufflevector <3 x i32> %a, <3 x i32> %b, <3 x i32> ret <3 x i32> %c } diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll index 8f35a69f52b85..8cd1bcfb82dcc 100644 --- a/llvm/test/CodeGen/AArch64/sub.ll +++ b/llvm/test/CodeGen/AArch64/sub.ll @@ -343,10 +343,24 @@ entry: } define <3 x i32> @v3i32(<3 x i32> %d, <3 x i32> %e) { -; CHECK-LABEL: v3i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: v3i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v3i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v3.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v3.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v2.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v3.s[2], v1.s[2] +; CHECK-GI-NEXT: sub v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] +; CHECK-GI-NEXT: ret entry: %s = sub <3 x i32> %d, %e ret <3 x i32> %s @@ -408,8 +422,9 @@ define <3 x i64> @v3i64(<3 x i64> %d, <3 x i64> %e) { ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] ; CHECK-GI-NEXT: sub x8, x8, x9 -; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: mov v2.d[0], x8 ; CHECK-GI-NEXT: sub v0.2d, v0.2d, v3.2d +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll index d71aed2d17506..69fd0ad01b7c5 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll @@ -187,12 +187,22 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind { } define i32 @test_v3i32(<3 x i32> %a) nounwind { -; CHECK-LABEL: test_v3i32: -; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.s[3], wzr -; CHECK-NEXT: umaxv s0, v0.4s -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_v3i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov v0.s[3], wzr +; CHECK-SD-NEXT: umaxv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_v3i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[2] +; CHECK-GI-NEXT: mov v1.s[3], wzr +; CHECK-GI-NEXT: umaxv s0, v1.4s +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret %b = call i32 @llvm.vector.reduce.umax.v3i32(<3 x i32> %a) ret i32 %b } diff --git a/llvm/test/CodeGen/AArch64/xtn.ll b/llvm/test/CodeGen/AArch64/xtn.ll index 8a4d6b8c7b789..96474a84ca992 100644 --- a/llvm/test/CodeGen/AArch64/xtn.ll +++ 
b/llvm/test/CodeGen/AArch64/xtn.ll @@ -293,10 +293,19 @@ entry: } define <3 x i16> @xtn_v3i32_v3i16(<3 x i32> %a) { -; CHECK-LABEL: xtn_v3i32_v3i16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: xtn_v3i32_v3i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: xtn_v3i32_v3i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret entry: %arg1 = trunc <3 x i32> %a to <3 x i16> ret <3 x i16> %arg1 diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll index 0d5010113ce0b..2e979bb122560 100644 --- a/llvm/test/CodeGen/AArch64/zext.ll +++ b/llvm/test/CodeGen/AArch64/zext.ll @@ -243,11 +243,15 @@ define <3 x i16> @zext_v3i8_v3i16(<3 x i8> %a) { ; CHECK-GI-LABEL: zext_v3i8_v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: and w8, w0, #0xff -; CHECK-GI-NEXT: and w9, w1, #0xff -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: and w8, w1, #0xff +; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: and w8, w2, #0xff -; CHECK-GI-NEXT: mov v0.h[1], w9 -; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -269,11 +273,14 @@ define <3 x i32> @zext_v3i8_v3i32(<3 x i8> %a) { ; CHECK-GI-LABEL: zext_v3i8_v3i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: and w8, w0, #0xff -; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: mov v1.s[0], w8 ; CHECK-GI-NEXT: and w8, w1, #0xff -; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: mov v1.s[1], w8 ; CHECK-GI-NEXT: and w8, w2, #0xff -; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: ret entry: %c = zext <3 x i8> %a to <3 x i32> @@ -301,14 +308,17 @@ define <3 x i64> @zext_v3i8_v3i64(<3 x i8> %a) { ; CHECK-GI-LABEL: zext_v3i8_v3i64: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-GI-NEXT: and x8, x0, #0xff ; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2 -; CHECK-GI-NEXT: and x8, x0, #0xff -; CHECK-GI-NEXT: and x9, x1, #0xff -; CHECK-GI-NEXT: and x10, x2, #0xff -; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: fmov d1, x9 -; CHECK-GI-NEXT: fmov d2, x10 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: and x8, x1, #0xff +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: and x8, x2, #0xff +; CHECK-GI-NEXT: mov v2.d[0], x8 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: %c = zext <3 x i8> %a to <3 x i64> @@ -330,7 +340,9 @@ define <3 x i32> @zext_v3i16_v3i32(<3 x i16> %a) { ; CHECK-GI-NEXT: umov w8, v0.h[2] ; CHECK-GI-NEXT: mov v1.s[1], w9 ; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: ret entry: %c = zext <3 x i16> %a to <3 x i32> @@ 
-354,10 +366,13 @@ define <3 x i64> @zext_v3i16_v3i64(<3 x i16> %a) { ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: umov w8, v0.h[0] ; CHECK-GI-NEXT: umov w9, v0.h[1] -; CHECK-GI-NEXT: umov w10, v0.h[2] -; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: fmov d1, x9 -; CHECK-GI-NEXT: fmov d2, x10 +; CHECK-GI-NEXT: mov v3.d[0], x8 +; CHECK-GI-NEXT: umov w8, v0.h[2] +; CHECK-GI-NEXT: mov v3.d[1], x9 +; CHECK-GI-NEXT: mov v2.d[0], x8 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: mov d1, v3.d[1] +; CHECK-GI-NEXT: fmov d0, d3 ; CHECK-GI-NEXT: ret entry: %c = zext <3 x i16> %a to <3 x i64> @@ -379,10 +394,13 @@ define <3 x i64> @zext_v3i32_v3i64(<3 x i32> %a) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov w8, v0.s[0] ; CHECK-GI-NEXT: mov w9, v0.s[1] -; CHECK-GI-NEXT: mov w10, v0.s[2] -; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: fmov d1, x9 -; CHECK-GI-NEXT: fmov d2, x10 +; CHECK-GI-NEXT: mov v3.d[0], x8 +; CHECK-GI-NEXT: mov w8, v0.s[2] +; CHECK-GI-NEXT: mov v3.d[1], x9 +; CHECK-GI-NEXT: mov v2.d[0], x8 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: mov d1, v3.d[1] +; CHECK-GI-NEXT: fmov d0, d3 ; CHECK-GI-NEXT: ret entry: %c = zext <3 x i32> %a to <3 x i64> @@ -402,11 +420,15 @@ define <3 x i16> @zext_v3i10_v3i16(<3 x i10> %a) { ; CHECK-GI-LABEL: zext_v3i10_v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: and w8, w0, #0x3ff -; CHECK-GI-NEXT: and w9, w1, #0x3ff -; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: and w8, w1, #0x3ff +; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: and w8, w2, #0x3ff -; CHECK-GI-NEXT: mov v0.h[1], w9 -; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: @@ -428,11 +450,14 @@ define <3 x i32> @zext_v3i10_v3i32(<3 x i10> %a) { ; CHECK-GI-LABEL: zext_v3i10_v3i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: and w8, w0, #0x3ff -; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: mov v1.s[0], w8 ; CHECK-GI-NEXT: and w8, w1, #0x3ff -; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: mov v1.s[1], w8 ; CHECK-GI-NEXT: and w8, w2, #0x3ff -; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[2] ; CHECK-GI-NEXT: ret entry: %c = zext <3 x i10> %a to <3 x i32> @@ -459,14 +484,17 @@ define <3 x i64> @zext_v3i10_v3i64(<3 x i10> %a) { ; CHECK-GI-LABEL: zext_v3i10_v3i64: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-GI-NEXT: and x8, x0, #0x3ff ; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2 -; CHECK-GI-NEXT: and x8, x0, #0x3ff -; CHECK-GI-NEXT: and x9, x1, #0x3ff -; CHECK-GI-NEXT: and x10, x2, #0x3ff -; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: fmov d1, x9 -; CHECK-GI-NEXT: fmov d2, x10 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: and x8, x1, #0x3ff +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: and x8, x2, #0x3ff +; CHECK-GI-NEXT: mov v2.d[0], x8 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: %c = zext <3 x i10> %a to <3 x i64> diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll index c8b82716a9fe1..74f259d7cd4cc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -9,8 +9,13 @@ define <2 x i16> @v_add_v2i16(<2 x i16> %a, <2 x i16> %b) { ; GFX7-LABEL: v_add_v2i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16: @@ -45,8 +50,13 @@ define <2 x i16> @v_add_v2i16_fneg_lhs(<2 x half> %a, <2 x i16> %b) { ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16_fneg_lhs: @@ -84,8 +94,13 @@ define <2 x i16> @v_add_v2i16_fneg_rhs(<2 x i16> %a, <2 x half> %b) { ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16_fneg_rhs: @@ -130,6 +145,11 @@ define <2 x i16> @v_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) { ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs: @@ -165,8 +185,13 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) { ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_splat: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v0 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xffffffc0, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_splat: @@ -197,8 +222,13 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_lo(<2 x i16> %a) { ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_lo: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v0 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v1 
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_lo: @@ -230,8 +260,13 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) { ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_hi: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v0 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xffffffc0, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_hi: @@ -614,6 +649,11 @@ define <2 x i16> @add_inline_imm_neg1_0(<2 x i16> %x) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: add_inline_imm_neg1_0: @@ -645,6 +685,11 @@ define <2 x i16> @add_inline_imm_1_0(<2 x i16> %x) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: add_inline_imm_1_0: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index 63f5464371cc6..aba7ded8fe17f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -753,6 +753,11 @@ define <2 x i16> @v_ashr_v2i16(<2 x i16> %value, <2 x i16> %amount) { ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, v2, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ashr_v2i16: @@ -782,10 +787,15 @@ define <2 x i16> @v_ashr_v2i16_15(<2 x i16> %value) { ; GFX6-LABEL: v_ashr_v2i16_15: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 15, v0 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 15, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ashr_v2i16_15: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll index 132dc876b3b05..b026fdb755c00 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll @@ -566,6 +566,11 @@ define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) { ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 8, 8 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_bswap_v2i16: @@ -609,6 +614,10 @@ define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) { ; GFX8-LABEL: v_bswap_v3i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: s_mov_b32 s4, 0x2030001 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4 ; GFX8-NEXT: v_perm_b32 v1, 0, v1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir index 42e53bedb8d85..26e8fe2c9a27c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir @@ -838,11 +838,18 @@ body: | ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = reassoc G_FADD [[FMUL]], [[BUILD_VECTOR2]] - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX9-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C]](s32) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C1]](s32) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C2]](s32) + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX9-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[EVEC]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[EVEC1]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[EVEC2]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[EVEC3]](s32) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX9-CONTRACT-LABEL: name: test_4xfloat_add_mul @@ -864,11 +871,18 @@ body: | ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] - ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>) - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - 
; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-CONTRACT-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX9-CONTRACT-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C]](s32) + ; GFX9-CONTRACT-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-CONTRACT-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C1]](s32) + ; GFX9-CONTRACT-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-CONTRACT-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C2]](s32) + ; GFX9-CONTRACT-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX9-CONTRACT-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C3]](s32) + ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[EVEC]](s32) + ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[EVEC1]](s32) + ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[EVEC2]](s32) + ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[EVEC3]](s32) ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX9-DENORM-LABEL: name: test_4xfloat_add_mul @@ -891,11 +905,18 @@ body: | ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = reassoc G_FADD [[FMUL]], [[BUILD_VECTOR2]] - ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>) - ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-DENORM-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX9-DENORM-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C]](s32) + ; GFX9-DENORM-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-DENORM-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C1]](s32) + ; GFX9-DENORM-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-DENORM-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C2]](s32) + ; GFX9-DENORM-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX9-DENORM-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C3]](s32) + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[EVEC]](s32) + ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[EVEC1]](s32) + ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[EVEC2]](s32) + ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[EVEC3]](s32) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX9-UNSAFE-LABEL: name: test_4xfloat_add_mul @@ -917,11 +938,18 @@ body: | ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY 
[[UV1]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-UNSAFE-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX9-UNSAFE-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C]](s32) + ; GFX9-UNSAFE-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-UNSAFE-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C1]](s32) + ; GFX9-UNSAFE-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-UNSAFE-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C2]](s32) + ; GFX9-UNSAFE-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX9-UNSAFE-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C3]](s32) + ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[EVEC]](s32) + ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[EVEC1]](s32) + ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[EVEC2]](s32) + ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[EVEC3]](s32) ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: test_4xfloat_add_mul @@ -944,11 +972,18 @@ body: | ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = reassoc G_FADD [[FMUL]], [[BUILD_VECTOR2]] - ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX10-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C]](s32) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C1]](s32) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C2]](s32) + ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[EVEC]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[EVEC1]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[EVEC2]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[EVEC3]](s32) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-CONTRACT-LABEL: name: test_4xfloat_add_mul @@ -970,11 +1005,18 @@ body: | ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] - ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>) - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-CONTRACT-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + 
; GFX10-CONTRACT-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C]](s32) + ; GFX10-CONTRACT-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-CONTRACT-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C1]](s32) + ; GFX10-CONTRACT-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-CONTRACT-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C2]](s32) + ; GFX10-CONTRACT-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10-CONTRACT-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C3]](s32) + ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[EVEC]](s32) + ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[EVEC1]](s32) + ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[EVEC2]](s32) + ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[EVEC3]](s32) ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-DENORM-LABEL: name: test_4xfloat_add_mul @@ -997,11 +1039,18 @@ body: | ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = reassoc G_FADD [[FMUL]], [[BUILD_VECTOR2]] - ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>) - ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-DENORM-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX10-DENORM-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C]](s32) + ; GFX10-DENORM-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-DENORM-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C1]](s32) + ; GFX10-DENORM-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-DENORM-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C2]](s32) + ; GFX10-DENORM-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10-DENORM-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<4 x s32>), [[C3]](s32) + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[EVEC]](s32) + ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[EVEC1]](s32) + ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[EVEC2]](s32) + ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[EVEC3]](s32) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-UNSAFE-LABEL: name: test_4xfloat_add_mul @@ -1023,11 +1072,18 @@ body: | ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] - ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>) - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-UNSAFE-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; 
GFX10-UNSAFE-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C]](s32) + ; GFX10-UNSAFE-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-UNSAFE-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C1]](s32) + ; GFX10-UNSAFE-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNSAFE-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C2]](s32) + ; GFX10-UNSAFE-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10-UNSAFE-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<4 x s32>), [[C3]](s32) + ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[EVEC]](s32) + ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[EVEC1]](s32) + ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[EVEC2]](s32) + ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[EVEC3]](s32) ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 %4:_(s32) = COPY $vgpr0 %5:_(s32) = COPY $vgpr1 @@ -1077,10 +1133,15 @@ body: | ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]] - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX9-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C]](s32) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C1]](s32) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C2]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[EVEC]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[EVEC1]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[EVEC2]](s32) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 ; ; GFX9-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs @@ -1099,10 +1160,15 @@ body: | ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] - ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>) - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX9-CONTRACT-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX9-CONTRACT-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C]](s32) + ; GFX9-CONTRACT-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-CONTRACT-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C1]](s32) + ; GFX9-CONTRACT-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-CONTRACT-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C2]](s32) + ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[EVEC]](s32) + ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[EVEC1]](s32) + ; GFX9-CONTRACT-NEXT: $vgpr2 = 
COPY [[EVEC2]](s32) ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 ; ; GFX9-DENORM-LABEL: name: test_3xfloat_add_mul_rhs @@ -1122,10 +1188,15 @@ body: | ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]] - ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>) - ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX9-DENORM-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX9-DENORM-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C]](s32) + ; GFX9-DENORM-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-DENORM-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C1]](s32) + ; GFX9-DENORM-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-DENORM-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C2]](s32) + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[EVEC]](s32) + ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[EVEC1]](s32) + ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[EVEC2]](s32) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 ; ; GFX9-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs @@ -1144,10 +1215,15 @@ body: | ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX9-UNSAFE-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX9-UNSAFE-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C]](s32) + ; GFX9-UNSAFE-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-UNSAFE-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C1]](s32) + ; GFX9-UNSAFE-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-UNSAFE-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C2]](s32) + ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[EVEC]](s32) + ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[EVEC1]](s32) + ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[EVEC2]](s32) ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 ; ; GFX10-LABEL: name: test_3xfloat_add_mul_rhs @@ -1167,10 +1243,15 @@ body: | ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]] - ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY 
[[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX10-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C]](s32) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C1]](s32) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C2]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[EVEC]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[EVEC1]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[EVEC2]](s32) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 ; ; GFX10-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs @@ -1189,10 +1270,15 @@ body: | ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] - ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>) - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX10-CONTRACT-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX10-CONTRACT-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C]](s32) + ; GFX10-CONTRACT-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-CONTRACT-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C1]](s32) + ; GFX10-CONTRACT-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-CONTRACT-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C2]](s32) + ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[EVEC]](s32) + ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[EVEC1]](s32) + ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[EVEC2]](s32) ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 ; ; GFX10-DENORM-LABEL: name: test_3xfloat_add_mul_rhs @@ -1212,10 +1298,15 @@ body: | ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]] - ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>) - ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX10-DENORM-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX10-DENORM-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C]](s32) + ; GFX10-DENORM-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-DENORM-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C1]](s32) + ; GFX10-DENORM-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-DENORM-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FADD]](<3 x s32>), [[C2]](s32) + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[EVEC]](s32) + ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[EVEC1]](s32) + ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[EVEC2]](s32) 
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 ; ; GFX10-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs @@ -1234,10 +1325,15 @@ body: | ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] - ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>) - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX10-UNSAFE-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX10-UNSAFE-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C]](s32) + ; GFX10-UNSAFE-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-UNSAFE-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C1]](s32) + ; GFX10-UNSAFE-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNSAFE-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[FMA]](<3 x s32>), [[C2]](s32) + ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[EVEC]](s32) + ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[EVEC1]](s32) + ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[EVEC2]](s32) ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 %4:_(s32) = COPY $vgpr0 %5:_(s32) = COPY $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir index 2845a632a84b3..5777ecfce459f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir @@ -15,8 +15,9 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3 - ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1) - ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64) + ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1) ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], %el1 ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32) %0:_(s32) = COPY $vgpr0 @@ -45,8 +46,9 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3 - ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1) - ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64) + ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1) ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], %el1 ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32) %0:_(s32) = COPY $vgpr0 @@ -77,8 +79,9 @@ body: | ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 
1) - ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64) + ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1) ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], %el1 @@ -114,8 +117,9 @@ body: | ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1) - ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64) + ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1) ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], %el1 @@ -147,8 +151,9 @@ body: | ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr4_vgpr5 - ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1) - ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64) + ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1) ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY2]], [[COPY3]], %el1 ; GFX10-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[FMA]] ; GFX10-NEXT: $vgpr0 = COPY [[FMA1]](s32) @@ -179,8 +184,9 @@ body: | ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr4_vgpr5 - ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1) - ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64) + ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1) ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY2]], [[COPY3]], %el1 ; GFX10-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[FMA]] ; GFX10-NEXT: $vgpr0 = COPY [[FMA1]](s32) @@ -213,8 +219,9 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3 - ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1) - ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64) + ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1) ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr4 ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr5 @@ -258,8 +265,9 @@ body: | ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX10-NEXT: 
[[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3 - ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1) - ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64) + ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1) ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr4 ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr5 @@ -304,8 +312,9 @@ body: | ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1) - ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64) + ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1) ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr2 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr4 @@ -347,8 +356,9 @@ body: | ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1) - ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64) + ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1) ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr2 ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr3 @@ -399,8 +409,9 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1) - ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64) + ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1) ; GFX10-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG %el1 ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[FNEG]] ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32) @@ -430,8 +441,9 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3 - ; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1) - ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD %ptr, [[C]](s64) + ; GFX10-NEXT: %el1:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1) ; GFX10-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]] ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[COPY1]], %el1 ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll index 9eeb633f0a817..e91251186a18d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll @@ -68,8 +68,20 @@ define <2 x i16> @halfinsts_add_v2i16(<2 x i16> %arg0) #1 { ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY]] ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[COPY1]] - ; CHECK-NEXT: $vgpr0 = COPY [[ADD]](s32) - ; CHECK-NEXT: $vgpr1 = COPY [[ADD1]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST]](<2 x s16>) + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C2]](s32) + ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST]](<2 x s16>) + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C1]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[LSHR]](s32) + ; CHECK-NEXT: $vgpr1 = COPY [[LSHR1]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 %add = add <2 x i16> %arg0, %arg0 ret <2 x i16> %add diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll index 5ba036c386a40..d723ccccda695 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -774,20 +774,23 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) { ; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v6, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v4, v6, v6 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v5, v6, v2 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v5, v6, v2 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_fdiv_v2f16: @@ -826,6 +829,9 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) { ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1 ; GFX6-FLUSH-NEXT: 
v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-IEEE-LABEL: v_fdiv_v2f16: @@ -1076,16 +1082,19 @@ define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) { ; GFX6-LABEL: v_fdiv_v2f16_afn: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_rcp_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f16_afn: @@ -1152,20 +1161,23 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v6, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v4, v6, v6 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v5, v6, v2 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v5, v6, v2 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_fdiv_v2f16_ulp25: @@ -1204,6 +1216,9 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-IEEE-LABEL: v_fdiv_v2f16_ulp25: @@ -1467,20 +1482,23 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v1, v1, v2 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 -; GFX6-IEEE-NEXT: 
v_div_scale_f32 v3, s[4:5], v1, v1, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v2, v1, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v5, v5 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v6, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v5, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v4, v6, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v6 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_v2f16: @@ -1519,6 +1537,9 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-IEEE-LABEL: v_rcp_v2f16: @@ -1770,20 +1791,23 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v1, v1, v2 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v2, v1, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v5, v5 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v6, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v5, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v4, v6, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v6 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_neg_rcp_v2f16: @@ -1822,6 +1846,9 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1 +; 
GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-IEEE-LABEL: v_neg_rcp_v2f16: @@ -2067,6 +2094,7 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1 @@ -2076,22 +2104,24 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v3, v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v1, v0, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v5, v5 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v6, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v5, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v4, v6, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_v2f16_fabs: @@ -2117,24 +2147,27 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v4 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v0, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, 
v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v4 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-IEEE-LABEL: v_rcp_v2f16_fabs: @@ -2389,6 +2422,7 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1 @@ -2398,22 +2432,24 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v3, v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v1, v0, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v5, v5 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v6, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v5, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v4, v6, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_neg_rcp_v2f16_fabs: @@ -2439,24 +2475,27 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 +; 
GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v4 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v0, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v4 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-IEEE-LABEL: v_neg_rcp_v2f16_fabs: @@ -2717,20 +2756,23 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) { ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v1, v1, v2 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v2, v1, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v5, v5 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v6, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v5, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v4, v6, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v6 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp: @@ -2769,6 +2811,9 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) { ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rcp_v2f16_arcp: @@ -2812,15 +2857,18 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) { ; GFX6-LABEL: v_rcp_v2f16_arcp_afn: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, 1.0 -; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rcp_v2f16_arcp_afn: @@ -2877,20 +2925,23 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v1, v1, v2 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v2, v1, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v5, v5 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v6, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v5, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v4, v6, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v6 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25: @@ -2929,6 +2980,9 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-IEEE-LABEL: v_rcp_v2f16_ulp25: @@ -3167,16 +3221,19 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX6-LABEL: v_fdiv_v2f16_afn_ulp25: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_rcp_f32_e32 v2, v2 +; 
GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_rcp_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25: @@ -3243,20 +3300,23 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v6, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v4, v6, v6 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v5, v6, v2 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v5, v6, v2 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_fdiv_v2f16_arcp_ulp25: @@ -3295,6 +3355,9 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f16_arcp_ulp25: @@ -3347,16 +3410,19 @@ define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX6-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_rcp_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: @@ -5395,8 +5461,11 @@ define <2 x 
half> @v_rsq_v2f16(<2 x half> %a) { ; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5] ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rsq_v2f16: @@ -5441,6 +5510,9 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-IEEE-LABEL: v_rsq_v2f16: @@ -5709,8 +5781,11 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5] ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_neg_rsq_v2f16: @@ -5755,6 +5830,9 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-IEEE-LABEL: v_neg_rsq_v2f16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll index 99e6c5d06a0e1..f3237a2612616 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll @@ -237,16 +237,19 @@ define <2 x half> @v_fma_v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z) { ; GFX6-LABEL: v_fma_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v2f16: @@ -291,16 +294,19 @@ define <2 x half> @v_fma_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y, <2 x half> ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: 
v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v2f16_fneg_lhs: @@ -347,16 +353,19 @@ define <2 x half> @v_fma_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y, <2 x half> ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v2f16_fneg_rhs: @@ -398,16 +407,19 @@ define <2 x half> @v_fma_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y, <2 x h ; GFX6-LABEL: v_fma_v2f16_fneg_lhs_rhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v2f16_fneg_lhs_rhs: @@ -511,22 +523,28 @@ define <4 x half> @v_fma_v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) { ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX6-NEXT: v_fma_f32 v0, v0, v4, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v9 +; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v11 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 -; GFX6-NEXT: v_fma_f32 v1, v1, v5, v9 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v10 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v11 -; GFX6-NEXT: v_fma_f32 v2, v2, v4, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 
v8, v10 +; GFX6-NEXT: v_fma_f32 v1, v1, v4, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_fma_f32 v3, v3, v6, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_fma_f32 v3, v3, v7, v9 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_fma_f32 v2, v2, v6, v8 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll index 543f8e413abd8..882eacafef195 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll @@ -144,8 +144,12 @@ define <3 x half> @v_fmul_v3f16_fneg_lhs(<3 x half> %a, <3 x half> %b) { ; GFX8-LABEL: v_fmul_v3f16_fneg_lhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, 0x80008000 +; GFX8-NEXT: v_xor_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3 @@ -174,8 +178,12 @@ define <3 x half> @v_fmul_v3f16_fneg_rhs(<3 x half> %a, <3 x half> %b) { ; GFX8-LABEL: v_fmul_v3f16_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, 0x80008000 +; GFX8-NEXT: v_xor_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll index 0577117e9d9e1..228d30a040aad 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll @@ -376,31 +376,34 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) { ; GFX6-LABEL: v_pow_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, 0xc2fc0000 +; GFX6-NEXT: v_log_f32_e32 v1, v1 ; GFX6-NEXT: v_log_f32_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, 0xc2fc0000 ; GFX6-NEXT: v_mov_b32_e32 v5, 0x42800000 -; GFX6-NEXT: v_log_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v3 ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_cmp_lt_f32_e64 
s[4:5], v1, v4 ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4 +; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc +; GFX6-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v3, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc -; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2 -; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GFX6-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_exp_f32_e32 v1, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_exp_f32_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, 0x1f800000 +; GFX6-NEXT: v_cndmask_b32_e32 v3, 1.0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v2, 1.0, v2, s[4:5] ; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f16: @@ -506,21 +509,24 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) { ; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v1, v2 +; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; GFX6-NEXT: v_cmp_lt_f32_e64 s[4:5], v0, v2 ; GFX6-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v5, 0x1f800000 -; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc -; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5] ; GFX6-NEXT: v_exp_f32_e32 v1, v1 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX6-NEXT: v_exp_f32_e32 v2, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, v1, v6 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v5, vcc -; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX6-NEXT: v_exp_f32_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, 0x1f800000 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v5, vcc +; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 1.0, v5, s[4:5] +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f16_fneg_lhs: @@ -620,9 +626,9 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) { ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: v_log_f32_e32 v0, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -632,21 +638,24 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) { ; GFX6-NEXT: v_mov_b32_e32 v3, 0xc2fc0000 ; GFX6-NEXT: v_mov_b32_e32 v4, 0x42800000 ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 +; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; GFX6-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v3 ; GFX6-NEXT: 
v_add_f32_e32 v0, v0, v5 -; GFX6-NEXT: v_mov_b32_e32 v5, 0x1f800000 -; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc -; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; GFX6-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5] ; GFX6-NEXT: v_exp_f32_e32 v0, v0 +; GFX6-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_exp_f32_e32 v1, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, 0x1f800000 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v5, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v6 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 1.0, v5, s[4:5] ; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f16_fneg_rhs: @@ -748,11 +757,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) { ; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX6-NEXT: v_log_f32_e32 v3, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -762,21 +771,24 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) { ; GFX6-NEXT: v_mov_b32_e32 v3, 0xc2fc0000 ; GFX6-NEXT: v_mov_b32_e32 v4, 0x42800000 ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3 +; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; GFX6-NEXT: v_cmp_lt_f32_e64 s[4:5], v0, v3 ; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX6-NEXT: v_mov_b32_e32 v5, 0x1f800000 -; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc -; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, v4, s[4:5] ; GFX6-NEXT: v_exp_f32_e32 v2, v2 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX6-NEXT: v_exp_f32_e32 v1, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, v2, v6 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v5, vcc -; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_exp_f32_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, 0x1f800000 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v5, vcc +; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 1.0, v5, s[4:5] +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f16_fneg_lhs_rhs: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index 3bd3486ec261d..3dc014a3588dd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -3983,6 +3983,11 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) { ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_or_b32_e32 v1, 
v1, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_v2i16: @@ -4063,6 +4068,11 @@ define <2 x i16> @v_fshl_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 7, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_v2i16_4_8: @@ -5037,7 +5047,17 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) ; GFX6-NEXT: v_bfe_u32 v4, v7, 1, 15 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v1, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index 58304d2072d7f..b12ad74462e7e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -3763,6 +3763,11 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) { ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshr_v2i16: @@ -3852,6 +3857,11 @@ define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 7, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshr_v2i16_4_8: @@ -4341,6 +4351,10 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, < ; ; GFX8-LABEL: s_fshr_v3i16: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_lshr_b32 s8, s4, 16 +; GFX8-NEXT: s_and_b32 s4, s4, 0xffff +; GFX8-NEXT: s_lshl_b32 s8, s8, 16 +; GFX8-NEXT: s_or_b32 s4, s4, s8 ; GFX8-NEXT: s_and_b32 s8, 0xffff, s2 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 @@ -4373,6 +4387,7 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, < ; GFX8-NEXT: s_lshr_b32 s4, s6, s4 ; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s3 +; GFX8-NEXT: s_and_b32 s5, s5, 0xffff ; GFX8-NEXT: s_lshl_b32 s1, s1, 1 ; GFX8-NEXT: s_lshr_b32 s4, s4, 15 ; GFX8-NEXT: s_or_b32 s1, s1, s4 @@ -4593,6 +4608,9 @@ define <3 x half> 
@v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) ; GFX8-LABEL: v_fshr_v3i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v0 ; GFX8-NEXT: v_lshrrev_b16_e32 v7, 15, v2 ; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 @@ -4623,7 +4641,7 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) ; GFX8-NEXT: v_lshrrev_b16_e32 v2, 15, v3 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 1, v3 -; GFX8-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX8-NEXT: v_xor_b32_sdwa v3, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v4, 15, v3 ; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 @@ -5013,36 +5031,46 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX6-NEXT: v_bfe_u32 v4, v6, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 14, v4 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX6-NEXT: v_bfe_u32 v4, v7, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 14, v4 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v6 -; GFX6-NEXT: v_xor_b32_e32 v6, -1, v9 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v7 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GFX6-NEXT: v_and_b32_e32 v8, 15, v6 -; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 -; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX6-NEXT: v_bfe_u32 v4, v4, 1, 15 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v8, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 15, v7 -; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7 -; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 -; GFX6-NEXT: v_bfe_u32 v4, v5, 1, 15 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v2 +; GFX6-NEXT: v_bfe_u32 v2, v6, 1, 15 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 14, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v3 +; GFX6-NEXT: v_bfe_u32 v3, v7, 1, 15 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 14, v3 +; GFX6-NEXT: v_xor_b32_e32 v5, -1, v9 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v7, 15, v5 +; GFX6-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, v7, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 15, v6 +; GFX6-NEXT: v_xor_b32_e32 v5, -1, v6 +; GFX6-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX6-NEXT: 
v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_bfe_u32 v3, v4, 1, 15 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v1, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshr_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll index 8e4e4cf2c5b87..cd02df5882ca1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll @@ -27,10 +27,14 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: load_1d @@ -55,10 +59,14 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX10NSA-NEXT: 
SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: load_1d @@ -83,10 +91,14 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -120,10 +132,14 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: load_2d @@ -151,10 +167,14 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: 
$vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: load_2d @@ -182,10 +202,14 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -227,10 +251,14 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: load_3d @@ -265,10 +293,14 @@ define amdgpu_ps <4 x float> @load_3d(<8 
x i32> inreg %rsrc, <2 x i16> %coords_l ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: load_3d @@ -302,10 +334,14 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -348,10 +384,14 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = 
COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: load_cube @@ -386,10 +426,14 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: load_cube @@ -423,10 +467,14 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.cube), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ 
-462,10 +510,14 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coo ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1darray), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: load_1darray @@ -493,10 +545,14 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coo ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1darray), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: load_1darray @@ -524,10 +580,14 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coo ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1darray), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; 
GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -569,10 +629,14 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: load_2darray @@ -607,10 +671,14 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: load_2darray @@ -644,10 +712,14 @@ define amdgpu_ps <4 x float> 
@load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -690,10 +762,14 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2dmsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: load_2dmsaa @@ -728,10 +804,14 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2dmsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY 
[[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: load_2dmsaa @@ -765,10 +845,14 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2dmsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -813,10 +897,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16> ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: load_2darraymsaa @@ 
-853,10 +941,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16> ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: load_2darraymsaa @@ -892,10 +984,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16> ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -932,10 +1028,14 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x 
s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: load_mip_1d @@ -963,10 +1063,14 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: load_mip_1d @@ -994,10 +1098,14 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ 
-1039,10 +1147,14 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: load_mip_2d @@ -1077,10 +1189,14 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: load_mip_2d @@ -1114,10 +1230,14 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: 
$vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -1162,10 +1282,14 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: load_mip_3d @@ -1202,10 +1326,14 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit 
$vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: load_mip_3d @@ -1241,10 +1369,14 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -1290,10 +1422,14 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: load_mip_cube @@ -1330,10 +1466,14 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), 
[[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: load_mip_cube @@ -1369,10 +1509,14 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -1416,10 +1560,14 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16> ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX9-NEXT: $vgpr3 = 
COPY [[COPY13]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: load_mip_1darray @@ -1454,10 +1602,14 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16> ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: load_mip_1darray @@ -1491,10 +1643,14 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16> ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -1539,10 +1695,14 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16> ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: 
(dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: load_mip_2darray @@ -1579,10 +1739,14 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16> ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: load_mip_2darray @@ -1618,10 +1782,14 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16> ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = 
COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -3283,10 +3451,14 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: getresinfo_1d @@ -3311,10 +3483,14 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: getresinfo_1d @@ -3339,10 +3515,14 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES 
[[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -3373,10 +3553,14 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: getresinfo_2d @@ -3401,10 +3585,14 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: getresinfo_2d @@ -3429,10 +3617,14 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg 
%rsrc, <2 x i16> %co ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -3463,10 +3655,14 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.3d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: getresinfo_3d @@ -3491,10 +3687,14 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.3d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: 
[[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: getresinfo_3d @@ -3519,10 +3719,14 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.3d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -3553,10 +3757,14 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> % ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.cube), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: getresinfo_cube @@ -3581,10 +3789,14 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> % ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.cube), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), 
[[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: getresinfo_cube @@ -3609,10 +3821,14 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> % ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.cube), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -3643,10 +3859,14 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: getresinfo_1darray @@ -3671,10 +3891,14 @@ define 
amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16 ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: getresinfo_1darray @@ -3699,10 +3923,14 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16 ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -3733,10 +3961,14 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY 
[[UV1]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: getresinfo_2darray @@ -3761,10 +3993,14 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16 ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: getresinfo_2darray @@ -3789,10 +4025,14 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16 ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -3823,10 +4063,14 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2dmsaa), 15, [[BUILD_VECTOR1]](<2 x 
s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: getresinfo_2dmsaa @@ -3851,10 +4095,14 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2dmsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: getresinfo_2dmsaa @@ -3879,10 +4127,14 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2dmsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, 
implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -3913,10 +4165,14 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darraymsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: getresinfo_2darraymsaa @@ -3941,10 +4197,14 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darraymsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: getresinfo_2darraymsaa @@ -3969,10 +4229,14 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darraymsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 
= COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -4081,8 +4345,10 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coord ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 9, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<2 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 ; ; GFX10NSA-LABEL: name: load_1d_V2 @@ -4107,8 +4373,10 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coord ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 9, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<2 x s32>), addrspace 8) ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 ; ; GFX12-LABEL: name: load_1d_V2 @@ -4133,8 +4401,10 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coord ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 9, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<2 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 main_body: %s = extractelement <2 x 
i16> %coords, i32 0 @@ -4327,10 +4597,14 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: load_1d_glc @@ -4355,10 +4629,14 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: load_1d_glc @@ -4383,10 +4661,14 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = 
COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -4417,10 +4699,14 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: load_1d_slc @@ -4445,10 +4731,14 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: load_1d_slc @@ -4473,10 +4763,14 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), 
[[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -4507,10 +4801,14 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> % ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: load_1d_glc_slc @@ -4535,10 +4833,14 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> % ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + 
; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: load_1d_glc_slc @@ -4563,10 +4865,14 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> % ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -4851,10 +5157,14 @@ define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x floa ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY1]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY2]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY3]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: getresinfo_dmask0 @@ -4863,10 +5173,14 @@ define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x floa ; GFX10NSA-NEXT: {{ $}} ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; 
GFX10NSA-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY1]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY2]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY3]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: getresinfo_dmask0 @@ -4875,10 +5189,14 @@ define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x floa ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY1]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY2]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY3]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -4911,10 +5229,14 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX9-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: load_1d_tfe @@ -4941,10 +5263,14 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX10NSA-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GFX10NSA-NEXT: $vgpr0 = 
COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: load_1d_tfe @@ -4971,10 +5297,14 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX12-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -5013,10 +5343,14 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX9-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: load_2d_tfe @@ 
-5046,10 +5380,14 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX10NSA-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: load_2d_tfe @@ -5079,10 +5417,14 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX12-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -5129,10 +5471,14 @@ define amdgpu_ps <4 x float> @load_3d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX9-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; 
GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: load_3d_tfe @@ -5169,10 +5515,14 @@ define amdgpu_ps <4 x float> @load_3d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX10NSA-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: load_3d_tfe @@ -5208,10 +5558,14 @@ define amdgpu_ps <4 x float> @load_3d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX12-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, 
implicit $vgpr2, implicit $vgpr3 main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -5261,10 +5615,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, <2 x i ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX9-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: load_2darraymsaa_tfe @@ -5303,10 +5661,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, <2 x i ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX10NSA-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: load_2darraymsaa_tfe @@ -5344,10 +5706,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, <2 x i ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = 
G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX12-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll index f61f985cd24ab..294172336aef0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll @@ -119,6 +119,7 @@ define amdgpu_ps <3 x half> @image_load_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 ; UNPACKED-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) ; UNPACKED-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<3 x s16>), align 8, addrspace 8) ; UNPACKED-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<3 x s32>) + ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) ; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C]] ; UNPACKED-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C]] @@ -126,7 +127,7 @@ define amdgpu_ps <3 x half> @image_load_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 ; UNPACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; UNPACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; UNPACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; UNPACKED-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV2]], [[C]] + ; UNPACKED-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]] ; UNPACKED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; UNPACKED-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C1]](s32) ; UNPACKED-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] @@ -363,6 +364,7 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16(<8 x i32> inreg %rsrc, i32 %s, ; UNPACKED-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (<3 x s16>), align 8, addrspace 8) ; UNPACKED-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<4 x s32>) ; UNPACKED-NEXT: G_STORE [[UV3]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) + ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY 
[[UV2]](s32) ; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C]] ; UNPACKED-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C]] @@ -370,7 +372,7 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16(<8 x i32> inreg %rsrc, i32 %s, ; UNPACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; UNPACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; UNPACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; UNPACKED-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV2]], [[C]] + ; UNPACKED-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]] ; UNPACKED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; UNPACKED-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C1]](s32) ; UNPACKED-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] @@ -598,6 +600,8 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1100(<8 x i32> inreg %rsrc, ; UNPACKED-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) ; UNPACKED-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<2 x s16>), addrspace 8) ; UNPACKED-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<2 x s32>) + ; UNPACKED-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) ; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C]] ; UNPACKED-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C]] @@ -605,9 +609,10 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1100(<8 x i32> inreg %rsrc, ; UNPACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; UNPACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; UNPACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; UNPACKED-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]] ; UNPACKED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; UNPACKED-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C1]](s32) - ; UNPACKED-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]] + ; UNPACKED-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] ; UNPACKED-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; UNPACKED-NEXT: $vgpr0 = COPY [[BITCAST]](<2 x s16>) ; UNPACKED-NEXT: $vgpr1 = COPY [[BITCAST1]](<2 x s16>) @@ -630,10 +635,14 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1100(<8 x i32> inreg %rsrc, ; PACKED-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; PACKED-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) ; PACKED-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<2 x s16>), addrspace 8) - ; PACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; PACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; PACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C1]](s32) - ; PACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]] + ; PACKED-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; PACKED-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) + ; PACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; PACKED-NEXT: 
[[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]] + ; PACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; PACKED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; PACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32) + ; PACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; PACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; PACKED-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<2 x s16>) ; PACKED-NEXT: $vgpr1 = COPY [[BITCAST]](<2 x s16>) @@ -660,6 +669,9 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1000(<8 x i32> inreg %rsrc, ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; UNPACKED-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) ; UNPACKED-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (s16), addrspace 8) + ; UNPACKED-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) + ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32) ; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]], [[C]] ; UNPACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 @@ -667,7 +679,8 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1000(<8 x i32> inreg %rsrc, ; UNPACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32) ; UNPACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; UNPACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; UNPACKED-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C1]], [[SHL]] + ; UNPACKED-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C]] + ; UNPACKED-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]] ; UNPACKED-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; UNPACKED-NEXT: $vgpr0 = COPY [[BITCAST]](<2 x s16>) ; UNPACKED-NEXT: $vgpr1 = COPY [[BITCAST1]](<2 x s16>) @@ -690,10 +703,14 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1000(<8 x i32> inreg %rsrc, ; PACKED-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; PACKED-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) ; PACKED-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (s16), addrspace 8) - ; PACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; PACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; PACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C1]](s32) - ; PACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]] + ; PACKED-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; PACKED-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) + ; PACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; PACKED-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]] + ; PACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; PACKED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; PACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32) + ; PACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; PACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; PACKED-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<2 x s16>) ; PACKED-NEXT: $vgpr1 = COPY 
[[BITCAST]](<2 x s16>) @@ -1145,6 +1162,8 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1100(<8 x i32> inreg %rs ; UNPACKED-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (<2 x s16>), addrspace 8) ; UNPACKED-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<3 x s32>) ; UNPACKED-NEXT: G_STORE [[UV2]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) + ; UNPACKED-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32) ; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C]] ; UNPACKED-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C]] @@ -1152,9 +1171,10 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1100(<8 x i32> inreg %rs ; UNPACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; UNPACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; UNPACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; UNPACKED-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]] ; UNPACKED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; UNPACKED-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C1]](s32) - ; UNPACKED-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]] + ; UNPACKED-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] ; UNPACKED-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; UNPACKED-NEXT: $vgpr0 = COPY [[BITCAST]](<2 x s16>) ; UNPACKED-NEXT: $vgpr1 = COPY [[BITCAST1]](<2 x s16>) @@ -1181,10 +1201,14 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1100(<8 x i32> inreg %rs ; PACKED-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<2 x s32>) ; PACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) ; PACKED-NEXT: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; PACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; PACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; PACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C1]](s32) - ; PACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]] + ; PACKED-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; PACKED-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32) + ; PACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; PACKED-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]] + ; PACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; PACKED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; PACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32) + ; PACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; PACKED-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; PACKED-NEXT: $vgpr0 = COPY [[BITCAST]](<2 x s16>) ; PACKED-NEXT: $vgpr1 = COPY [[BITCAST1]](<2 x s16>) @@ -1217,6 +1241,9 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1000(<8 x i32> inreg %rs ; UNPACKED-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (s16), addrspace 8) ; UNPACKED-NEXT: [[UV:%[0-9]+]]:_(s32), 
[[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<2 x s32>) ; UNPACKED-NEXT: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) + ; UNPACKED-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32) + ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32) ; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C]] ; UNPACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 @@ -1224,7 +1251,8 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1000(<8 x i32> inreg %rs ; UNPACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32) ; UNPACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; UNPACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; UNPACKED-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C1]], [[SHL]] + ; UNPACKED-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C]] + ; UNPACKED-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]] ; UNPACKED-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; UNPACKED-NEXT: $vgpr0 = COPY [[BITCAST]](<2 x s16>) ; UNPACKED-NEXT: $vgpr1 = COPY [[BITCAST1]](<2 x s16>) @@ -1251,10 +1279,14 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1000(<8 x i32> inreg %rs ; PACKED-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<2 x s32>) ; PACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) ; PACKED-NEXT: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; PACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; PACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; PACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C1]](s32) - ; PACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]] + ; PACKED-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; PACKED-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32) + ; PACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; PACKED-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]] + ; PACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; PACKED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; PACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32) + ; PACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; PACKED-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; PACKED-NEXT: $vgpr0 = COPY [[BITCAST]](<2 x s16>) ; PACKED-NEXT: $vgpr1 = COPY [[BITCAST1]](<2 x s16>) @@ -1287,6 +1319,9 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_0000(<8 x i32> inreg %rs ; UNPACKED-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD_D16_:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD_D16 intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (s16), addrspace 8) ; UNPACKED-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<2 x s32>) ; UNPACKED-NEXT: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) + ; UNPACKED-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32) + ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32) ; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C]] ; UNPACKED-NEXT: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 @@ -1294,7 +1329,8 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_0000(<8 x i32> inreg %rs ; UNPACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32) ; UNPACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; UNPACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; UNPACKED-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C1]], [[SHL]] + ; UNPACKED-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C]] + ; UNPACKED-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]] ; UNPACKED-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; UNPACKED-NEXT: $vgpr0 = COPY [[BITCAST]](<2 x s16>) ; UNPACKED-NEXT: $vgpr1 = COPY [[BITCAST1]](<2 x s16>) @@ -1321,10 +1357,14 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_0000(<8 x i32> inreg %rs ; PACKED-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD_D16_]](<2 x s32>) ; PACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) ; PACKED-NEXT: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; PACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; PACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; PACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C1]](s32) - ; PACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]] + ; PACKED-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; PACKED-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32) + ; PACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; PACKED-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]] + ; PACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; PACKED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; PACKED-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32) + ; PACKED-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; PACKED-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; PACKED-NEXT: $vgpr0 = COPY [[BITCAST]](<2 x s16>) ; PACKED-NEXT: $vgpr1 = COPY [[BITCAST1]](<2 x s16>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll index adf7e6d38b989..52030a90ef66e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll @@ -44,8 +44,10 @@ define amdgpu_ps <2 x float> @image_load_v2f32(<8 x i32> inreg %rsrc, i32 %s, i3 ; GCN-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) ; GCN-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<2 x s32>), addrspace 8) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) - ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32) ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %tex = call <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x float> %tex @@ -70,9 +72,12 @@ define amdgpu_ps <3 x float> 
@image_load_v3f32(<8 x i32> inreg %rsrc, i32 %s, i3 ; GCN-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) ; GCN-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<3 x s32>), align 16, addrspace 8) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>) - ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GCN-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32) ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 %tex = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <3 x float> %tex @@ -97,10 +102,14 @@ define amdgpu_ps <4 x float> @image_load_v4f32(<8 x i32> inreg %rsrc, i32 %s, i3 ; GCN-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) ; GCN-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GCN-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GCN-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GCN-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 %tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %tex @@ -157,8 +166,10 @@ define amdgpu_ps <2 x float> @image_load_tfe_v2f32(<8 x i32> inreg %rsrc, i32 %s ; GCN-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (<2 x s32>), addrspace 8) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>) ; GCN-NEXT: G_STORE [[UV2]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32) ; 
GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %res = call { <2 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v2f32i32s.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) %tex = extractvalue { <2 x float>, i32 } %res, 0 @@ -188,9 +199,12 @@ define amdgpu_ps <3 x float> @image_load_tfe_v3f32(<8 x i32> inreg %rsrc, i32 %s ; GCN-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (<3 x s32>), align 16, addrspace 8) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GCN-NEXT: G_STORE [[UV3]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GCN-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32) ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 %res = call { <3 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f32i32s.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) %tex = extractvalue { <3 x float>, i32 } %res, 0 @@ -220,10 +234,14 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32(<8 x i32> inreg %rsrc, i32 %s ; GCN-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GCN-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GCN-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GCN-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GCN-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 %res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) %tex = extractvalue { <4 x float>, i32 } %res, 0 @@ -265,8 +283,10 @@ define amdgpu_ps <2 x float> @image_load_v2f32_dmask_1000(<8 x i32> inreg %rsrc, ; GCN-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) ; GCN-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (s32), addrspace 8) ; 
GCN-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) - ; GCN-NEXT: $vgpr1 = COPY [[DEF]](s32) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32) ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %tex = call <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x float> %tex @@ -281,8 +301,10 @@ define amdgpu_ps <2 x float> @image_load_v2f32_dmask_0000(<8 x i32> inreg %rsrc, ; GCN-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<2 x s32>) - ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GCN-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GCN-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[COPY2]](s32) + ; GCN-NEXT: $vgpr1 = COPY [[COPY3]](s32) ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %tex = call <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x float> %tex @@ -308,9 +330,12 @@ define amdgpu_ps <3 x float> @image_load_v3f32_dmask_1100(<8 x i32> inreg %rsrc, ; GCN-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<2 x s32>), addrspace 8) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GCN-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GCN-NEXT: $vgpr2 = COPY [[DEF]](s32) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32) ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 %tex = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <3 x float> %tex @@ -335,9 +360,12 @@ define amdgpu_ps <3 x float> @image_load_v3f32_dmask_1000(<8 x i32> inreg %rsrc, ; GCN-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) ; GCN-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (s32), addrspace 8) ; GCN-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) - ; GCN-NEXT: $vgpr1 = COPY [[DEF]](s32) - ; GCN-NEXT: $vgpr2 = COPY [[DEF]](s32) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32) ; GCN-NEXT: 
SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 %tex = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <3 x float> %tex @@ -352,9 +380,12 @@ define amdgpu_ps <3 x float> @image_load_v3f32_dmask_0000(<8 x i32> inreg %rsrc, ; GCN-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<3 x s32>) - ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GCN-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GCN-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GCN-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GCN-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[COPY2]](s32) + ; GCN-NEXT: $vgpr1 = COPY [[COPY3]](s32) + ; GCN-NEXT: $vgpr2 = COPY [[COPY4]](s32) ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 %tex = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <3 x float> %tex @@ -380,10 +411,14 @@ define amdgpu_ps <4 x float> @image_load_v4f32_dmask_1110(<8 x i32> inreg %rsrc, ; GCN-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<3 x s32>), align 16, addrspace 8) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>) ; GCN-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GCN-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GCN-NEXT: $vgpr3 = COPY [[DEF]](s32) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GCN-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 %tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %tex @@ -409,10 +444,14 @@ define amdgpu_ps <4 x float> @image_load_v4f32_dmask_1100(<8 x i32> inreg %rsrc, ; GCN-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<2 x s32>), addrspace 8) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GCN-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GCN-NEXT: $vgpr2 = COPY [[DEF]](s32) - ; GCN-NEXT: $vgpr3 = COPY [[DEF]](s32) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GCN-NEXT: 
$vgpr1 = COPY [[COPY11]](s32) + ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GCN-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 %tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %tex @@ -437,10 +476,14 @@ define amdgpu_ps <4 x float> @image_load_v4f32_dmask_1000(<8 x i32> inreg %rsrc, ; GCN-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) ; GCN-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (s32), addrspace 8) ; GCN-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) - ; GCN-NEXT: $vgpr1 = COPY [[DEF]](s32) - ; GCN-NEXT: $vgpr2 = COPY [[DEF]](s32) - ; GCN-NEXT: $vgpr3 = COPY [[DEF]](s32) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GCN-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 %tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %tex @@ -455,10 +498,14 @@ define amdgpu_ps <4 x float> @image_load_v4f32_dmask_0000(<8 x i32> inreg %rsrc, ; GCN-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<4 x s32>) - ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GCN-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GCN-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GCN-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GCN-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GCN-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[COPY2]](s32) + ; GCN-NEXT: $vgpr1 = COPY [[COPY3]](s32) + ; GCN-NEXT: $vgpr2 = COPY [[COPY4]](s32) + ; GCN-NEXT: $vgpr3 = COPY [[COPY5]](s32) ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 %tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %tex @@ -516,8 +563,10 @@ define amdgpu_ps <2 x float> @image_load_tfe_v2f32_dmask_1000(<8 x i32> inreg %r ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GCN-NEXT: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) - ; GCN-NEXT: $vgpr0 = COPY [[UV2]](s32) - ; GCN-NEXT: $vgpr1 = COPY [[UV3]](s32) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GCN-NEXT: $vgpr0 = COPY 
[[COPY10]](s32) + ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32) ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %res = call { <2 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v2f32i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) %tex = extractvalue { <2 x float>, i32 } %res, 0 @@ -548,8 +597,10 @@ define amdgpu_ps <2 x float> @image_load_tfe_v2f32_dmask_0000(<8 x i32> inreg %r ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GCN-NEXT: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) - ; GCN-NEXT: $vgpr0 = COPY [[UV2]](s32) - ; GCN-NEXT: $vgpr1 = COPY [[UV3]](s32) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32) ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %res = call { <2 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v2f32i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) %tex = extractvalue { <2 x float>, i32 } %res, 0 @@ -580,9 +631,12 @@ define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_1100(<8 x i32> inreg %r ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>) ; GCN-NEXT: G_STORE [[UV2]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>) - ; GCN-NEXT: $vgpr0 = COPY [[UV3]](s32) - ; GCN-NEXT: $vgpr1 = COPY [[UV4]](s32) - ; GCN-NEXT: $vgpr2 = COPY [[UV5]](s32) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV4]](s32) + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV5]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32) ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 %res = call { <3 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f32i32s.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) %tex = extractvalue { <3 x float>, i32 } %res, 0 @@ -613,9 +667,12 @@ define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_1000(<8 x i32> inreg %r ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GCN-NEXT: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GCN-NEXT: $vgpr1 = COPY [[DEF1]](s32) - ; GCN-NEXT: $vgpr2 = COPY [[DEF1]](s32) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32) + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32) ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 %res = call { <3 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f32i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) %tex = extractvalue { <3 x float>, i32 } %res, 0 @@ 
-646,9 +703,12 @@ define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_0000(<8 x i32> inreg %r ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GCN-NEXT: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GCN-NEXT: $vgpr1 = COPY [[DEF1]](s32) - ; GCN-NEXT: $vgpr2 = COPY [[DEF1]](s32) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32) + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32) ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 %res = call { <3 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f32i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) %tex = extractvalue { <3 x float>, i32 } %res, 0 @@ -679,10 +739,14 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_1110(<8 x i32> inreg %r ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GCN-NEXT: G_STORE [[UV3]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GCN-NEXT: $vgpr0 = COPY [[UV4]](s32) - ; GCN-NEXT: $vgpr1 = COPY [[UV5]](s32) - ; GCN-NEXT: $vgpr2 = COPY [[UV6]](s32) - ; GCN-NEXT: $vgpr3 = COPY [[UV7]](s32) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV4]](s32) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV5]](s32) + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV6]](s32) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV7]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GCN-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 %res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) %tex = extractvalue { <4 x float>, i32 } %res, 0 @@ -713,10 +777,14 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_1100(<8 x i32> inreg %r ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>) ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GCN-NEXT: G_STORE [[UV2]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GCN-NEXT: $vgpr2 = COPY [[DEF1]](s32) - ; GCN-NEXT: $vgpr3 = COPY [[DEF1]](s32) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GCN-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit 
$vgpr3 %res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) %tex = extractvalue { <4 x float>, i32 } %res, 0 @@ -747,10 +815,14 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_1000(<8 x i32> inreg %r ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GCN-NEXT: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GCN-NEXT: $vgpr1 = COPY [[DEF1]](s32) - ; GCN-NEXT: $vgpr2 = COPY [[DEF1]](s32) - ; GCN-NEXT: $vgpr3 = COPY [[DEF1]](s32) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32) + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GCN-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 %res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) %tex = extractvalue { <4 x float>, i32 } %res, 0 @@ -781,10 +853,14 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_0000(<8 x i32> inreg %r ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GCN-NEXT: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GCN-NEXT: $vgpr1 = COPY [[DEF1]](s32) - ; GCN-NEXT: $vgpr2 = COPY [[DEF1]](s32) - ; GCN-NEXT: $vgpr3 = COPY [[DEF1]](s32) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32) + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[COPY10]](s32) + ; GCN-NEXT: $vgpr1 = COPY [[COPY11]](s32) + ; GCN-NEXT: $vgpr2 = COPY [[COPY12]](s32) + ; GCN-NEXT: $vgpr3 = COPY [[COPY13]](s32) ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 %res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) %tex = extractvalue { <4 x float>, i32 } %res, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll index 4d36e0f797016..3d90783b5cf69 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll @@ -25,10 +25,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i3 ; GFX6-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX6-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[BUILD_VECTOR1]](<4 x s32>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: 
(dereferenceable load (<4 x s32>), addrspace 8) ; GFX6-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX6-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX6-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX6-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX6-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX6-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX6-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX6-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX6-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX6-NEXT: $vgpr0 = COPY [[COPY12]](s32) + ; GFX6-NEXT: $vgpr1 = COPY [[COPY13]](s32) + ; GFX6-NEXT: $vgpr2 = COPY [[COPY14]](s32) + ; GFX6-NEXT: $vgpr3 = COPY [[COPY15]](s32) ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: load_2darraymsaa @@ -50,10 +54,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i3 ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY12]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY13]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY14]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY15]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -84,10 +92,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad ; GFX6-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[BUILD_VECTOR1]](<4 x s32>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX6-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX6-NEXT: G_STORE [[UV4]](s32), [[MV]](p1) :: (store (s32) into %ir.out, addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX6-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX6-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX6-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX6-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX6-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX6-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX6-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY 
[[UV3]](s32) + ; GFX6-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX6-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX6-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX6-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10NSA-LABEL: name: load_2darraymsaa_tfe @@ -113,10 +125,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX10NSA-NEXT: G_STORE [[UV4]](s32), [[MV]](p1) :: (store (s32) into %ir.out, addrspace 1) - ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10NSA-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10NSA-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10NSA-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10NSA-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX10NSA-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX10NSA-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX10NSA-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) %v.vec = extractvalue { <4 x float>, i32 } %v, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll index 5b017ad89a0ed..f058551644684 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll @@ -29,10 +29,14 @@ define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY13]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY14]](s32) + ; GFX9-NEXT: $vgpr2 = COPY 
[[COPY15]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY16]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_1d @@ -59,10 +63,14 @@ define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY13]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY14]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY16]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_1d @@ -89,10 +97,14 @@ define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY13]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY14]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY16]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_1d @@ -119,10 +131,14 @@ define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), 
[[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY13]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY14]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY16]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -155,10 +171,14 @@ define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_2d @@ -186,10 +206,14 @@ define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; 
GFX10-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_2d @@ -217,10 +241,14 @@ define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_2d @@ -248,10 +276,14 @@ define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -289,10 +321,14 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 
[[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_3d @@ -325,10 +361,14 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_3d @@ -361,10 +401,14 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; 
GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_3d @@ -396,10 +440,14 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16(i32 15, half %s, half %t, half %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -437,10 +485,14 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_cube @@ -473,10 +525,14 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; 
GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_cube @@ -509,10 +565,14 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_cube @@ -544,10 +604,14 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cube), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: 
[[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f16(i32 15, half %s, half %t, half %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -580,10 +644,14 @@ define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1darray), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_1darray @@ -611,10 +679,14 @@ define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1darray), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_1darray @@ -642,10 +714,14 @@ define amdgpu_ps <4 x float> 
@sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1darray), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_1darray @@ -673,10 +749,14 @@ define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1darray), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16(i32 15, half %s, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -714,10 +794,14 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = 
G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_2darray @@ -750,10 +834,14 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_2darray @@ -786,10 +874,14 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32) 
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_2darray @@ -821,10 +913,14 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2darray), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16(i32 15, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -859,10 +955,14 @@ define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_c_1d @@ -892,10 +992,14 @@ define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, 
[[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_1d @@ -925,10 +1029,14 @@ define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_1d @@ -957,10 +1065,14 @@ define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY 
[[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f16(i32 15, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -996,10 +1108,14 @@ define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_c_2d @@ -1030,10 +1146,14 @@ define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_2d @@ -1064,10 +1184,14 @@ define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 
x s16>) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_2d @@ -1097,10 +1221,14 @@ define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1133,10 +1261,14 @@ define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; 
GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_cl_1d @@ -1164,10 +1296,14 @@ define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_cl_1d @@ -1195,10 +1331,14 @@ define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_cl_1d @@ -1226,10 +1366,14 @@ define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, 
<4 x i32> inre ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f16(i32 15, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1267,10 +1411,14 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_cl_2d @@ -1303,10 +1451,14 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES 
[[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_cl_2d @@ -1339,10 +1491,14 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_cl_2d @@ -1374,10 +1530,14 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX12-NEXT: 
SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f16(i32 15, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1413,10 +1573,14 @@ define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_c_cl_1d @@ -1447,10 +1611,14 @@ define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_cl_1d @@ -1481,10 +1649,14 @@ define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, 
$noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_cl_1d @@ -1514,10 +1686,14 @@ define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f16(i32 15, float %zcompare, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1557,10 +1733,14 @@ define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: 
[[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_c_cl_2d @@ -1594,10 +1774,14 @@ define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_cl_2d @@ -1631,10 +1815,14 @@ define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_cl_2d @@ -1668,10 +1856,14 @@ define amdgpu_ps <4 x float> 
@sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1707,10 +1899,14 @@ define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_b_1d @@ -1741,10 +1937,14 @@ define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: 
[[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_b_1d @@ -1775,10 +1975,14 @@ define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_b_1d @@ -1808,10 +2012,14 @@ define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX12-NEXT: 
$vgpr2 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f16.f16(i32 15, half %bias, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1849,10 +2057,14 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_b_2d @@ -1885,10 +2097,14 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_b_2d @@ -1921,10 +2137,14 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD 
intrinsic(@llvm.amdgcn.image.sample.b.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_b_2d @@ -1956,10 +2176,14 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f16.f16(i32 15, half %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1997,10 +2221,14 @@ define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY 
[[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_c_b_1d @@ -2032,10 +2260,14 @@ define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_b_1d @@ -2067,10 +2299,14 @@ define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_b_1d @@ -2102,10 +2338,14 
@@ define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2145,10 +2385,14 @@ define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_c_b_2d @@ -2182,10 +2426,14 @@ define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 
0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_b_2d @@ -2219,10 +2467,14 @@ define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_b_2d @@ -2256,10 +2508,14 @@ define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: 
[[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2297,10 +2553,14 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_b_cl_1d @@ -2333,10 +2593,14 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_b_cl_1d @@ -2369,10 +2633,14 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-NEXT: 
[[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_b_cl_1d @@ -2404,10 +2672,14 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f16.f16(i32 15, half %bias, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2448,10 +2720,14 @@ define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), 
[[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_b_cl_2d @@ -2486,10 +2762,14 @@ define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[DEF]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_b_cl_2d @@ -2524,10 +2804,14 @@ define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[DEF]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr1 
= COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_b_cl_2d @@ -2562,10 +2846,14 @@ define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f16.f16(i32 15, half %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2605,10 +2893,14 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_c_b_cl_1d @@ -2642,10 +2934,14 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), 
[[TRUNC2]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_b_cl_1d @@ -2679,10 +2975,14 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_b_cl_1d @@ -2716,10 +3016,14 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: 
$vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2762,10 +3066,14 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY18]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY19]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY20]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_c_b_cl_2d @@ -2802,10 +3110,14 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[DEF]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 
= COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY18]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY19]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY20]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_b_cl_2d @@ -2842,10 +3154,14 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[DEF]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY18]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY19]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY20]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_b_cl_2d @@ -2882,10 +3198,14 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY18]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY19]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY20]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2924,10 +3244,14 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg 
%rsrc, <4 x i32> inreg ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_d_1d @@ -2960,10 +3284,14 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_d_1d @@ -2996,10 +3324,14 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES 
[[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_d_1d @@ -3032,10 +3364,14 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -3079,10 +3415,14 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; 
GFX9-NEXT: $vgpr0 = COPY [[COPY18]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY19]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY20]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY21]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_d_2d @@ -3120,10 +3460,14 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY18]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY19]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY20]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY21]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_d_2d @@ -3161,10 +3505,14 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY18]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY19]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY20]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY21]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_d_2d @@ -3202,10 +3550,14 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x 
s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY18]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY19]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY20]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY21]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -3259,10 +3611,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY21]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY22]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY23]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY24]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_d_3d @@ -3311,10 +3667,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, 
$noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY21]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY22]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY23]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY24]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_d_3d @@ -3363,10 +3723,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY21]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY22]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY23]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY24]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_d_3d @@ -3415,10 +3779,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; 
GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY21]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY22]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY23]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY24]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -3459,10 +3827,14 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_c_d_1d @@ -3497,10 +3869,14 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY 
[[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_d_1d @@ -3535,10 +3911,14 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_d_1d @@ -3573,10 +3953,14 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -3622,10 +4006,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg 
%rsrc, <4 x i32> inr ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_c_d_2d @@ -3665,10 +4053,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_d_2d @@ -3708,10 +4100,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), 
addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_d_2d @@ -3751,10 +4147,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -3795,10 +4195,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; 
GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_d_cl_1d @@ -3833,10 +4237,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_d_cl_1d @@ -3871,10 +4279,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_d_cl_1d @@ -3909,10 +4321,14 @@ define amdgpu_ps 
<4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -3960,10 +4376,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_d_cl_2d @@ -4005,10 +4425,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x 
s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_d_cl_2d @@ -4050,10 +4474,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_d_cl_2d @@ -4095,10 +4523,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY 
[[UV3]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -4141,10 +4573,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY18]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY19]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY20]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_c_d_cl_1d @@ -4181,10 +4617,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY18]](s32) + 
; GFX10-NEXT: $vgpr2 = COPY [[COPY19]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY20]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_d_cl_1d @@ -4221,10 +4661,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY18]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY19]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY20]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_d_cl_1d @@ -4261,10 +4705,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY18]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY19]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY20]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -4314,10 +4762,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX9-NEXT: 
[[CONCAT_VECTORS:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[CONCAT_VECTORS]](<10 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY20]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY21]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY22]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY23]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_c_d_cl_2d @@ -4361,10 +4813,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY20]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY21]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY22]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY23]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_d_cl_2d @@ -4408,10 +4864,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 
[[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY20]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY21]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY22]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY23]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_d_cl_2d @@ -4456,10 +4916,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY20]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY21]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY22]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY23]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -4498,10 +4962,14 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; 
GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_cd_1d @@ -4534,10 +5002,14 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_cd_1d @@ -4570,10 +5042,14 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, 
implicit $vgpr3 ; ; GFX12-LABEL: name: sample_cd_1d @@ -4606,10 +5082,14 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -4653,10 +5133,14 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY18]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY19]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY20]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY21]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_cd_2d @@ -4694,10 +5178,14 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), 
[[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY18]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY19]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY20]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY21]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_cd_2d @@ -4735,10 +5223,14 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY18]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY19]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY20]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY21]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_cd_2d @@ -4776,10 +5268,14 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32) 
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY18]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY19]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY20]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY21]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -4820,10 +5316,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_c_cd_1d @@ -4858,10 +5358,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX10-NEXT: 
SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_cd_1d @@ -4896,10 +5400,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_cd_1d @@ -4934,10 +5442,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -4983,10 +5495,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>) ; 
GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_c_cd_2d @@ -5026,10 +5542,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_cd_2d @@ -5069,10 +5589,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = 
COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_cd_2d @@ -5112,10 +5636,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -5156,10 +5684,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: 
[[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_cd_cl_1d @@ -5194,10 +5726,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_cd_cl_1d @@ -5232,10 +5768,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_cd_cl_1d @@ -5270,10 +5810,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX12-NEXT: 
[[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -5321,10 +5865,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_cd_cl_2d @@ -5366,10 +5914,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 
8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_cd_cl_2d @@ -5411,10 +5963,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX11-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_cd_cl_2d @@ -5456,10 +6012,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX12-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = 
COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -5502,10 +6062,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY18]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY19]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY20]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_c_cd_cl_1d @@ -5542,10 +6106,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY18]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY19]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY20]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, 
implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_cd_cl_1d @@ -5582,10 +6150,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY18]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY19]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY20]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_cd_cl_1d @@ -5622,10 +6194,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY18]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY19]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY20]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -5675,10 +6251,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 
x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[CONCAT_VECTORS]](<10 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY20]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY21]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY22]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY23]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_c_cd_cl_2d @@ -5722,10 +6302,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY20]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY21]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY22]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY23]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_cd_cl_2d @@ -5769,10 +6353,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), 
[[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY20]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY21]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY22]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY23]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_cd_cl_2d @@ -5817,10 +6405,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY20]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY21]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY22]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY23]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -5853,10 +6445,14 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; 
GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_l_1d @@ -5884,10 +6480,14 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_l_1d @@ -5915,10 +6515,14 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_l_1d @@ -5946,10 +6550,14 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD 
intrinsic(@llvm.amdgcn.image.sample.l.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f16(i32 15, half %s, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -5987,10 +6595,14 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_l_2d @@ -6023,10 +6635,14 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; 
GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_l_2d @@ -6059,10 +6675,14 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_l_2d @@ -6094,10 +6714,14 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f16(i32 15, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> 
%samp, i1 0, i32 0, i32 0) @@ -6133,10 +6757,14 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_c_l_1d @@ -6167,10 +6795,14 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_l_1d @@ -6201,10 +6833,14 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), 
[[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_l_1d @@ -6234,10 +6870,14 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f16(i32 15, float %zcompare, half %s, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -6277,10 +6917,14 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + 
; GFX9-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_c_l_2d @@ -6314,10 +6958,14 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_l_2d @@ -6351,10 +6999,14 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_l_2d @@ -6388,10 +7040,14 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD 
intrinsic(@llvm.amdgcn.image.sample.c.l.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -6423,10 +7079,14 @@ define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY13]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY14]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY15]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY16]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_lz_1d @@ -6453,10 +7113,14 @@ define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: 
[[COPY13:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY13]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY14]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY16]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_lz_1d @@ -6483,10 +7147,14 @@ define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY13]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY14]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY16]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_lz_1d @@ -6513,10 +7181,14 @@ define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY13]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY14]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY16]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f16(i32 15, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -6549,10 +7221,14 @@ define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg 
%rsrc, <4 x i32> inre ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_lz_2d @@ -6580,10 +7256,14 @@ define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_lz_2d @@ -6611,10 +7291,14 @@ define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = 
COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_lz_2d @@ -6642,10 +7326,14 @@ define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f16(i32 15, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -6680,10 +7368,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_c_lz_1d @@ 
-6713,10 +7405,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_lz_1d @@ -6746,10 +7442,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_lz_1d @@ -6778,10 +7478,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES 
[[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY14]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY17]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f16(i32 15, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -6817,10 +7521,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX10-LABEL: name: sample_c_lz_2d @@ -6851,10 +7559,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr1 = COPY 
[[COPY16]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_lz_2d @@ -6885,10 +7597,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_lz_2d @@ -6918,10 +7634,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -7162,8 +7882,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x 
s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<2 x s32>), addrspace 8) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) - ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[COPY21]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[COPY22]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 ; ; GFX10-LABEL: name: sample_c_d_o_2darray_V2 @@ -7210,8 +7932,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<2 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY21]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY22]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 ; ; GFX11-LABEL: name: sample_c_d_o_2darray_V2 @@ -7258,8 +7982,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<2 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY21]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY22]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 ; ; GFX12-LABEL: name: sample_c_d_o_2darray_V2 @@ -7306,8 +8032,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; GFX12-NEXT: 
[[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<2 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY21]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY22]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 main_body: %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f16(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.d.ll index 241170b94318a..d7c1c7a6bef5e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.d.ll @@ -34,10 +34,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<9 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR2]](<9 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY21]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY22]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY23]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY24]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_d_3d @@ -70,10 +74,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32) ; GFX11-NEXT: 
[[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR2]](<5 x s32>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY21]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY22]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY23]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY24]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_d_3d @@ -106,10 +114,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<6 x s32>) = G_BUILD_VECTOR [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[BUILD_VECTOR2]](<6 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY21]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY22]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY23]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY24]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -148,10 +160,14 @@ define amdgpu_ps <4 x float> @sample_c_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<10 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.3d), 15, 
[[BUILD_VECTOR2]](<10 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY22]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY23]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY24]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY25]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_d_3d @@ -185,10 +201,14 @@ define amdgpu_ps <4 x float> @sample_c_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<6 x s32>) = G_BUILD_VECTOR [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR2]](<6 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY22]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY23]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY24]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY25]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_d_3d @@ -222,10 +242,14 @@ define amdgpu_ps <4 x float> @sample_c_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<7 x s32>) = G_BUILD_VECTOR [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[BUILD_VECTOR2]](<7 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 
x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY22]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY23]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY24]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY25]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.3d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -265,10 +289,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_3d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<11 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.3d), 15, [[BUILD_VECTOR2]](<11 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY23]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY24]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY25]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY26]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_d_cl_3d @@ -303,10 +331,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_3d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<7 x s32>) = G_BUILD_VECTOR [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR2]](<7 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: 
$vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY23]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY24]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY25]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY26]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_d_cl_3d @@ -341,10 +373,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_3d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[BUILD_VECTOR2]](<8 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY23]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY24]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY25]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY26]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.3d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, float %s, float %t, float %r, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -385,10 +421,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_o_3d(<8 x i32> inreg %rsrc, <4 x i32 ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<12 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.o.3d), 15, [[BUILD_VECTOR2]](<12 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY 
[[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY24]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY25]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY26]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY27]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_d_cl_o_3d @@ -424,10 +464,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_o_3d(<8 x i32> inreg %rsrc, <4 x i32 ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.o.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR2]](<8 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY24]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY25]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY26]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY27]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_d_cl_o_3d @@ -463,10 +507,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_o_3d(<8 x i32> inreg %rsrc, <4 x i32 ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<9 x s32>) = G_BUILD_VECTOR [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.o.3d), 15, [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[BUILD_VECTOR2]](<9 x s32>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: 
[[COPY26:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY24]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY25]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY26]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY27]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.3d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, float %s, float %t, float %r, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.a16.ll index f05b258c974d1..477965ab8981b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.a16.ll @@ -34,10 +34,14 @@ define amdgpu_ps <4 x float> @sample_d_1d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3 ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_d_1d_g16_a16 @@ -70,10 +74,14 @@ define amdgpu_ps <4 x float> @sample_d_1d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3 ; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY 
[[UV1]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_d_1d_g16_a16 @@ -106,10 +114,14 @@ define amdgpu_ps <4 x float> @sample_d_1d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3 ; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -152,10 +164,14 @@ define amdgpu_ps <4 x float> @sample_d_2d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3 ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY18]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY19]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY20]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY21]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: 
sample_d_2d_g16_a16 @@ -193,10 +209,14 @@ define amdgpu_ps <4 x float> @sample_d_2d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3 ; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY18]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY19]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY20]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY21]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_d_2d_g16_a16 @@ -234,10 +254,14 @@ define amdgpu_ps <4 x float> @sample_d_2d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3 ; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY18]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY19]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY20]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY21]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -291,10 +315,14 @@ define amdgpu_ps <4 x float> @sample_d_3d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3 ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>) ; GFX10-NEXT: 
[[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY21]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY22]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY23]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY24]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_d_3d_g16_a16 @@ -343,10 +371,14 @@ define amdgpu_ps <4 x float> @sample_d_3d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3 ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY21]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY22]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY23]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY24]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_d_3d_g16_a16 @@ -395,10 +427,14 @@ define amdgpu_ps <4 x float> @sample_d_3d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3 ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: 
[[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY21]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY22]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY23]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY24]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll index cc2a8ba9c4d5d..e78a9897be9c5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll @@ -33,10 +33,14 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_d_1d @@ -68,10 +72,14 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES 
[[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_d_1d @@ -103,10 +111,14 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -148,10 +160,14 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY18]](s32) + ; 
GFX10-NEXT: $vgpr1 = COPY [[COPY19]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY20]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY21]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_d_2d @@ -188,10 +204,14 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY18]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY19]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY20]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY21]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_d_2d @@ -228,10 +248,14 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY18]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY19]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY20]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY21]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -283,10 +307,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = 
G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[CONCAT_VECTORS]](<14 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY21]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY22]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY23]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY24]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_d_3d @@ -333,10 +361,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY21]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY22]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY23]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY24]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_d_3d @@ -383,10 +415,14 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR5]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), 
[[BUILD_VECTOR4]](<2 x s16>), [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY21]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY22]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY23]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY24]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -425,10 +461,14 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_d_1d @@ -462,10 +502,14 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - 
; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_d_1d @@ -499,10 +543,14 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -546,10 +594,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX10-NEXT: 
$vgpr1 = COPY [[COPY20]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_d_2d @@ -588,10 +640,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_d_2d @@ -631,10 +687,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -673,10 +733,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x 
i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_d_cl_1d @@ -710,10 +774,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_d_cl_1d @@ -747,10 +815,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES 
[[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -794,10 +866,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_d_cl_2d @@ -836,10 +912,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; 
GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_d_cl_2d @@ -879,10 +959,14 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -923,10 +1007,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY18]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY19]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY20]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; 
; GFX11-LABEL: name: sample_c_d_cl_1d @@ -962,10 +1050,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY18]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY19]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY20]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_d_cl_1d @@ -1002,10 +1094,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY18]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY19]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY20]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1052,10 +1148,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x 
s16>) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY20]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY21]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY22]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY23]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_d_cl_2d @@ -1097,10 +1197,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY20]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY21]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY22]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY23]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_d_cl_2d @@ -1142,10 +1246,14 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), 
[[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY20]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY21]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY22]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY23]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1182,10 +1290,14 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_cd_1d @@ -1217,10 +1329,14 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY 
[[UV2]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_cd_1d @@ -1252,10 +1368,14 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY18]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1297,10 +1417,14 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY18]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY19]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY20]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY21]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_cd_2d @@ -1337,10 +1461,14 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; 
GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY18]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY19]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY20]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY21]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_cd_2d @@ -1377,10 +1505,14 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY18]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY19]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY20]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY21]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1419,10 +1551,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), 
addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_cd_1d @@ -1456,10 +1592,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_cd_1d @@ -1493,10 +1633,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY 
[[COPY16]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1540,10 +1684,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_cd_2d @@ -1582,10 +1730,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_cd_2d @@ -1625,10 +1777,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x 
i32> in ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1667,10 +1823,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_cd_cl_1d @@ -1704,10 +1864,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x 
s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_cd_cl_1d @@ -1741,10 +1905,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY16]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY18]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY19]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1788,10 +1956,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY 
[[UV3]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_cd_cl_2d @@ -1830,10 +2002,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_cd_cl_2d @@ -1873,10 +2049,14 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY19]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY20]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY21]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY22]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit 
$vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1917,10 +2097,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY17]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY18]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY19]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY20]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_cd_cl_1d @@ -1956,10 +2140,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY17]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY18]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY19]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY20]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_cd_cl_1d @@ -1996,10 +2184,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = 
G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY17]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY18]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY19]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY20]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2046,10 +2238,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[COPY20]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[COPY21]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[COPY22]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[COPY23]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX11-LABEL: name: sample_c_cd_cl_2d @@ -2091,10 +2287,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, 
[[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX11-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[COPY20]](s32) + ; GFX11-NEXT: $vgpr1 = COPY [[COPY21]](s32) + ; GFX11-NEXT: $vgpr2 = COPY [[COPY22]](s32) + ; GFX11-NEXT: $vgpr3 = COPY [[COPY23]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; ; GFX12-LABEL: name: sample_c_cd_cl_2d @@ -2136,10 +2336,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>), addrspace 8) ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr0 = COPY [[COPY20]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[COPY21]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[COPY22]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[COPY23]](s32) ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2322,8 +2526,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[CONCAT_VECTORS]](<14 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<2 x s32>), addrspace 8) ; GFX10-NEXT: 
[[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
- ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY21]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY22]](s32)
 ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
 ;
 ; GFX11-LABEL: name: sample_c_d_o_2darray_V2
@@ -2367,8 +2573,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
 ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
 ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<2 x s32>), addrspace 8)
 ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
- ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX11-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX11-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY21]](s32)
+ ; GFX11-NEXT: $vgpr1 = COPY [[COPY22]](s32)
 ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
 ;
 ; GFX12-LABEL: name: sample_c_d_o_2darray_V2
@@ -2412,8 +2620,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
 ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
 ; GFX12-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<2 x s32>), addrspace 8)
 ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
- ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX12-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX12-NEXT: $vgpr0 = COPY [[COPY21]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[COPY22]](s32)
 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
 main_body:
 %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
index 12234088adca6..67ff69a70c1ce 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
+++
b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll @@ -242,12 +242,23 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 - ; UNPACKED-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>) - ; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNPACKED-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; UNPACKED-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY11]](<2 x s16>) + ; UNPACKED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>) + ; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; UNPACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>) + ; UNPACKED-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>) + ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; UNPACKED-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY12]], [[C]](s32) + ; UNPACKED-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>) + ; UNPACKED-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST1]](<2 x s32>) + ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; UNPACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNPACKED-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY13]], [[C1]](s32) + ; UNPACKED-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>) + ; UNPACKED-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST2]](<2 x s32>) + ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV5]](s32) + ; UNPACKED-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY14]], [[C]](s32) ; UNPACKED-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST]](s32), [[LSHR]](s32), [[BITCAST1]](s32) + ; UNPACKED-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LSHR]](s32), [[LSHR1]](s32), [[LSHR2]](s32) ; UNPACKED-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<3 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8) ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -268,26 +279,37 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ; GFX81-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX81-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 ; GFX81-NEXT: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 - ; GFX81-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>) - ; GFX81-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX81-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX81-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY11]](<2 x s16>) + ; GFX81-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>) + ; GFX81-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX81-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>) + ; GFX81-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>) + ; GFX81-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY 
[[UV]](s32) + ; GFX81-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY12]], [[C]](s32) + ; GFX81-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>) + ; GFX81-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST1]](<2 x s32>) + ; GFX81-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX81-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX81-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY13]], [[C1]](s32) + ; GFX81-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>) + ; GFX81-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST2]](<2 x s32>) + ; GFX81-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV5]](s32) + ; GFX81-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY14]], [[C]](s32) ; GFX81-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GFX81-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX81-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C1]] - ; GFX81-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LSHR]], [[C]](s32) + ; GFX81-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX81-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C2]] + ; GFX81-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LSHR1]], [[C1]](s32) ; GFX81-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; GFX81-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX81-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C1]] - ; GFX81-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX81-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) + ; GFX81-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; GFX81-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR2]], [[C2]] + ; GFX81-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX81-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C1]](s32) ; GFX81-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL1]] - ; GFX81-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; GFX81-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]] - ; GFX81-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) - ; GFX81-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>) - ; GFX81-NEXT: [[BITCAST5:%[0-9]+]]:_(<3 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>) - ; GFX81-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[BITCAST5]](<3 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8) + ; GFX81-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; GFX81-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL1]] + ; GFX81-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; GFX81-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) + ; GFX81-NEXT: [[BITCAST6:%[0-9]+]]:_(<3 x s32>) = G_BITCAST [[CONCAT_VECTORS1]](<6 x s16>) + ; GFX81-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[BITCAST6]](<3 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8) ; GFX81-NEXT: S_ENDPGM 0 ; ; GFX9-LABEL: name: image_store_v3f16 @@ -308,8 +330,29 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ; 
GFX9-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>) + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY12]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>) + ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST1]](<2 x s32>) + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY13]], [[C1]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>) + ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST2]](<2 x s32>) + ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV5]](s32) + ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY14]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) + ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS1]](<4 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8) ; GFX9-NEXT: S_ENDPGM 0 ; ; GFX10-LABEL: name: image_store_v3f16 @@ -330,8 +373,29 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>) + ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>) + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY12]], [[C]](s32) + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>) + ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = 
G_UNMERGE_VALUES [[BITCAST1]](<2 x s32>) + ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY13]], [[C1]](s32) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>) + ; GFX10-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST2]](<2 x s32>) + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV5]](s32) + ; GFX10-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY14]], [[C]](s32) + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GFX10-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8) + ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX10-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) + ; GFX10-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS1]](<4 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8) ; GFX10-NEXT: S_ENDPGM 0 ; ; GFX12-LABEL: name: image_store_v3f16 @@ -352,7 +416,28 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ; GFX12-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 ; GFX12-NEXT: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>) - ; GFX12-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 7, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8) + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>) + ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY12]], [[C]](s32) + ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>) + ; GFX12-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST1]](<2 x s32>) + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY13]], [[C1]](s32) + ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX12-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>) + ; GFX12-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST2]](<2 x s32>) + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV5]](s32) + ; 
GFX12-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY14]], [[C]](s32) + ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX12-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) + ; GFX12-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS1]](<4 x s16>), 7, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8) ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll index 41e915a4c1011..7c2c61deca375 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll @@ -375,10 +375,15 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) { ; GFX6-LABEL: abs_sgpr_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_sext_i32_i16 s1, s1 -; GFX6-NEXT: s_abs_i32 s0, s0 +; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_abs_i32 s1, s1 +; GFX6-NEXT: s_abs_i32 s0, s0 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshr_b32 s1, s0, 16 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: abs_sgpr_v2i16: @@ -415,6 +420,11 @@ define amdgpu_cs <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) { ; GFX6-NEXT: v_max_i32_e32 v0, v0, v2 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 ; GFX6-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1 ; GFX6-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-legalize-range-metadata.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-legalize-range-metadata.ll index b6b4301dadc7a..9c2ac009e44a3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-legalize-range-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-legalize-range-metadata.ll @@ -14,16 +14,18 @@ define <4 x i8> @global_load_v4i8_align4__rangemd(ptr addrspace(1) %ptr) { ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p1) :: (load (s32) from %ir.ptr, addrspace 1) - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32) - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C1]](s32) - ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; CHECK-NEXT: 
[[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C2]](s32)
- ; CHECK-NEXT: $vgpr0 = COPY [[LOAD]](s32)
- ; CHECK-NEXT: $vgpr1 = COPY [[LSHR]](s32)
- ; CHECK-NEXT: $vgpr2 = COPY [[LSHR1]](s32)
- ; CHECK-NEXT: $vgpr3 = COPY [[LSHR2]](s32)
+ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+ ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C3]](s32)
+ ; CHECK-NEXT: $vgpr0 = COPY [[LSHR]](s32)
+ ; CHECK-NEXT: $vgpr1 = COPY [[LSHR1]](s32)
+ ; CHECK-NEXT: $vgpr2 = COPY [[LSHR2]](s32)
+ ; CHECK-NEXT: $vgpr3 = COPY [[LSHR3]](s32)
 ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
 %load = load <4 x i8>, ptr addrspace(1) %ptr, align 4, !range !0, !noundef !1
 ret <4 x i8> %load
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 5dd4fa0809131..d2793000a31e2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -712,6 +712,9 @@ define <2 x i16> @v_lshr_v2i16(<2 x i16> %value, <2 x i16> %amount) {
 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_v2i16:
@@ -741,8 +744,11 @@ define <2 x i16> @v_lshr_v2i16_15(<2 x i16> %value) {
 ; GFX6-LABEL: v_lshr_v2i16_15:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_bfe_u32 v0, v0, 15, 1
 ; GFX6-NEXT: v_bfe_u32 v1, v1, 15, 1
+; GFX6-NEXT: v_bfe_u32 v0, v0, 15, 1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_v2i16_15:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 168e6dfa5f147..e361ebdf9b608 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -2750,25 +2750,32 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_min_i32_e32 v5, 0, v0
+; GFX6-NEXT: v_min_i32_e32 v6, 0, v0
+; GFX6-NEXT: v_bfrev_b32_e32 v7, 1
 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v0
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x80000000, v5
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v7, v6
 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x7fffffff, v4
-; GFX6-NEXT: v_max_i32_e32 v2, v5, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v6, v2
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4
 ; GFX6-NEXT: v_min_i32_e32 v4, 0, v1
+; GFX6-NEXT: v_bfrev_b32_e32 v5, -2
 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT: v_max_i32_e32 v3, 0, v1
 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x80000000, v4
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x7fffffff, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v5, v3
 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2
 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3
 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
+; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
index bac80f0777c02..d641913ada13d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
@@ -636,8 +636,13 @@ define <2 x i16> @v_sext_inreg_v2i16_8(<2 x i16> %value) {
 ; GFX6-LABEL: v_sext_inreg_v2i16_8:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_sext_inreg_v2i16_8:
@@ -673,8 +678,13 @@ define <2 x i16> @v_sext_inreg_v2i16_15(<2 x i16> %value) {
 ; GFX6-LABEL: v_sext_inreg_v2i16_15:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 1
 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 1
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_sext_inreg_v2i16_15:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index 4cf1c92539c36..08fc956f2dc45 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -720,6 +720,11 @@ define <2 x i16> @v_shl_v2i16(<2 x i16> %value, <2 x i16> %amount) {
 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0
 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_v2i16:
@@ -750,7 +755,10 @@ define <2 x i16> @v_shl_v2i16_15(<2 x i16> %value) {
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 15, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 15, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 31, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_v2i16_15:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 2572f8581f0ed..4d5a8cb6d6902 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -2753,22 +2753,29 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0
 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4
-; GFX6-NEXT: v_min_i32_e32 v5, -1, v0
-; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x80000000, v5
+; GFX6-NEXT: v_min_i32_e32 v6, -1, v0
+; GFX6-NEXT: v_bfrev_b32_e32 v7, 1
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v7
 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2
-; GFX6-NEXT: v_min_i32_e32 v2, v2, v5
+; GFX6-NEXT: v_min_i32_e32
v2, v2, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_bfrev_b32_e32 v5, -2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x7fffffff, v3 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x80000000, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll index 788692c94b0cf..a52e70a4cfc48 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -1872,8 +1872,9 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX6-NEXT: v_not_b32_e32 v3, v1 ; GFX6-NEXT: v_min_u32_e32 v2, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll index 0042d34e235d1..1e3c6d1559dab 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -1784,8 +1784,9 @@ define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_min_u32_e32 v2, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index 9f093cc7b5abf..d21e3e7165ef0 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -9251,11 +9251,21 @@ define <4 x i16> @multi_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v0, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v0, v1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v0, v1 +; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, v1, v5 +; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v7 ; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v2, v4 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v5 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 
16, v1 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v2, v1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: multi_use_mul_mad_v2i16_var: @@ -9373,12 +9383,17 @@ define <2 x i16> @other_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GFX67-GISEL-NEXT: s_mov_b32 m0, -1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX67-GISEL-NEXT: ds_write_b32 v6, v2 ; GFX67-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index 4bed23487445a..9661154a64381 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -6338,14 +6338,17 @@ define <2 x half> @v_exp_v2f16(<2 x half> %in) { ; SI-GISEL-LABEL: v_exp_v2f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_v2f16: @@ -6444,15 +6447,18 @@ define <2 x half> @v_exp_fabs_v2f16(<2 x half> %in) { ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_fabs_v2f16: @@ -6556,15 +6562,18 @@ define <2 x half> 
@v_exp_fneg_fabs_v2f16(<2 x half> %in) { ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_fneg_fabs_v2f16: @@ -6669,15 +6678,18 @@ define <2 x half> @v_exp_fneg_v2f16(<2 x half> %in) { ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_fneg_v2f16: @@ -6758,19 +6770,22 @@ define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) { ; SI-GISEL-LABEL: v_exp_v2f16_fast: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 0x3dc5 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_v2f16_fast: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index ec7e52532cd32..045492aeed07b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -6431,14 
+6431,17 @@ define <2 x half> @v_exp10_v2f16(<2 x half> %in) { ; SI-GISEL-LABEL: v_exp10_v2f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp10_v2f16: @@ -6537,15 +6540,18 @@ define <2 x half> @v_exp10_fabs_v2f16(<2 x half> %in) { ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp10_fabs_v2f16: @@ -6649,15 +6655,18 @@ define <2 x half> @v_exp10_fneg_fabs_v2f16(<2 x half> %in) { ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp10_fneg_fabs_v2f16: @@ -6762,15 +6771,18 @@ define <2 x half> @v_exp10_fneg_v2f16(<2 x half> %in) { ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 
-; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp10_fneg_v2f16: @@ -6852,19 +6864,22 @@ define <2 x half> @v_exp10_v2f16_fast(<2 x half> %in) { ; SI-GISEL-LABEL: v_exp10_v2f16_fast: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 0x3dc5 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp10_v2f16_fast: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index 32b599e63c61d..1e520c1750f5f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -2307,12 +2307,15 @@ define <2 x half> @v_exp2_v2f16(<2 x half> %in) { ; SI-GISEL-LABEL: v_exp2_v2f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_v2f16: @@ -2384,12 +2387,15 @@ define <2 x half> @v_exp2_fabs_v2f16(<2 x half> %in) { ; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; 
VI-SDAG-LABEL: v_exp2_fabs_v2f16: @@ -2468,12 +2474,15 @@ define <2 x half> @v_exp2_fneg_fabs_v2f16(<2 x half> %in) { ; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0 ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_fneg_fabs_v2f16: @@ -2553,12 +2562,15 @@ define <2 x half> @v_exp2_fneg_v2f16(<2 x half> %in) { ; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_fneg_v2f16: @@ -2628,12 +2640,15 @@ define <2 x half> @v_exp2_v2f16_fast(<2 x half> %in) { ; SI-GISEL-LABEL: v_exp2_v2f16_fast: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_v2f16_fast: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll index b9fef0834cb24..fa85f0db33e2e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll @@ -236,9 +236,12 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) { ; GFX6-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, v3 ; GFX6-GISEL-NEXT: v_frexp_exp_i32_f32_e32 v5, v1 ; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_frexp_v2f16_v2i32: @@ -323,8 +326,11 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) { ; GFX6-GISEL-NEXT: v_frexp_mant_f32_e32 v3, v1 ; GFX6-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, v2 ; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 
-; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_fract: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll index 72e86f1f6f999..17b24ad2ee08b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll @@ -504,12 +504,15 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) { ; GFX6-GISEL-LABEL: test_ldexp_v2f16_v2i32: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v3 -; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_ldexp_v2f16_v2i32: @@ -638,8 +641,11 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) { ; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GFX6-GISEL-NEXT: v_bfe_i32 v2, v3, 0, 16 ; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v2 -; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_ldexp_v2f16_v2i16: @@ -1087,18 +1093,24 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) { ; GFX6-GISEL-LABEL: test_ldexp_v4f16_v4i32: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v4 +; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v5 -; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v2, v2, v6 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v4 +; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v3, v3, v7 ; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v2, v2, v6 ; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-GISEL-NEXT: v_or_b32_e32 v2, v2, v1 +; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_ldexp_v4f16_v4i32: @@ -1292,11 +1304,17 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) 
{ ; GFX6-GISEL-NEXT: v_bfe_i32 v4, v6, 0, 16 ; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v2, v2, v4 ; GFX6-GISEL-NEXT: v_bfe_i32 v4, v7, 0, 16 +; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v3, v3, v4 ; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-GISEL-NEXT: v_or_b32_e32 v2, v2, v1 +; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_ldexp_v4f16_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index 7f4cf19e9b85b..897f0e9f024b2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -6608,14 +6608,17 @@ define <2 x half> @v_log_v2f16(<2 x half> %in) { ; SI-GISEL-LABEL: v_log_v2f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log_v2f16: @@ -6701,15 +6704,18 @@ define <2 x half> @v_log_fabs_v2f16(<2 x half> %in) { ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log_fabs_v2f16: @@ -6827,15 +6833,18 @@ define <2 x half> @v_log_fneg_fabs_v2f16(<2 x half> %in) { ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 
v2, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log_fneg_fabs_v2f16: @@ -6954,15 +6963,18 @@ define <2 x half> @v_log_fneg_v2f16(<2 x half> %in) { ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log_fneg_v2f16: @@ -7072,14 +7084,17 @@ define <2 x half> @v_log_v2f16_fast(<2 x half> %in) { ; SI-GISEL-LABEL: v_log_v2f16_fast: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log_v2f16_fast: @@ -7363,22 +7378,28 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) { ; SI-GISEL-LABEL: v_log_v4f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v3, v3 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: 
v_mul_f32_e32 v3, 0x3f317218, v3 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-GISEL-NEXT: v_or_b32_e32 v2, v2, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log_v4f16: @@ -7531,22 +7552,28 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) { ; SI-GISEL-LABEL: v_log_v4f16_fast: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v3, v3 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-GISEL-NEXT: v_or_b32_e32 v2, v2, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log_v4f16_fast: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index 1c64e6b76c957..74c56f5f22875 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -6608,14 +6608,17 @@ define <2 x half> @v_log10_v2f16(<2 x half> %in) { ; SI-GISEL-LABEL: v_log10_v2f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log10_v2f16: @@ -6701,15 +6704,18 @@ define <2 x half> @v_log10_fabs_v2f16(<2 x half> %in) { ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: 
v_and_b32_e32 v0, 0x7fff7fff, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log10_fabs_v2f16: @@ -6827,15 +6833,18 @@ define <2 x half> @v_log10_fneg_fabs_v2f16(<2 x half> %in) { ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log10_fneg_fabs_v2f16: @@ -6954,15 +6963,18 @@ define <2 x half> @v_log10_fneg_v2f16(<2 x half> %in) { ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log10_fneg_v2f16: @@ -7072,14 +7084,17 @@ define <2 x half> @v_log10_v2f16_fast(<2 x half> %in) { ; SI-GISEL-LABEL: v_log10_v2f16_fast: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: 
v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log10_v2f16_fast: @@ -7363,22 +7378,28 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) { ; SI-GISEL-LABEL: v_log10_v4f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v3, v3 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v3 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-GISEL-NEXT: v_or_b32_e32 v2, v2, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log10_v4f16: @@ -7531,22 +7552,28 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) { ; SI-GISEL-LABEL: v_log10_v4f16_fast: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v3, v3 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v3 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-GISEL-NEXT: v_or_b32_e32 v2, v2, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 
16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log10_v4f16_fast: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 50c52037dc4d3..87f46f6000961 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -3073,12 +3073,15 @@ define <2 x half> @v_log2_v2f16(<2 x half> %in) { ; SI-GISEL-LABEL: v_log2_v2f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_v2f16: @@ -3161,12 +3164,15 @@ define <2 x half> @v_log2_fabs_v2f16(<2 x half> %in) { ; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_fabs_v2f16: @@ -3268,12 +3274,15 @@ define <2 x half> @v_log2_fneg_fabs_v2f16(<2 x half> %in) { ; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0 ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_fneg_fabs_v2f16: @@ -3376,12 +3385,15 @@ define <2 x half> @v_log2_fneg_v2f16(<2 x half> %in) { ; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_fneg_v2f16: @@ -3474,12 +3486,15 @@ define 
<2 x half> @v_log2_v2f16_fast(<2 x half> %in) { ; SI-GISEL-LABEL: v_log2_v2f16_fast: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_v2f16_fast: @@ -3759,18 +3774,24 @@ define <4 x half> @v_log2_v4f16(<4 x half> %in) { ; SI-GISEL-LABEL: v_log2_v4f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v3, v3 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-GISEL-NEXT: v_or_b32_e32 v2, v2, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_v4f16: @@ -3889,18 +3910,24 @@ define <4 x half> @v_log2_v4f16_fast(<4 x half> %in) { ; SI-GISEL-LABEL: v_log2_v4f16_fast: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v3, v3 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-GISEL-NEXT: v_or_b32_e32 v2, v2, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_v4f16_fast: diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll index 95d579be04ed2..267236e53b40b 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -472,16 +472,19 @@ 
define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half ; GISEL-CI-LABEL: v_mad_mix_v2f32: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> @@ -794,26 +797,32 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half ; GISEL-CI-LABEL: v_mad_mix_v4f32: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GISEL-CI-NEXT: v_mac_f32_e32 v8, v0, v4 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GISEL-CI-NEXT: v_mac_f32_e32 v9, v1, v5 -; GISEL-CI-NEXT: v_mac_f32_e32 v10, v2, v6 +; GISEL-CI-NEXT: v_mac_f32_e32 v8, v0, v4 ; GISEL-CI-NEXT: v_mac_f32_e32 v11, v3, v7 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v10 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v11 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; GISEL-CI-NEXT: v_mac_f32_e32 v10, v2, v6 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v11 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v10 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GISEL-CI-NEXT: v_or_b32_e32 v2, v3, v1 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <4 x half> %src0 to <4 x float> %src1.ext = fpext <4 x half> %src1 to <4 x float> @@ -909,30 +918,33 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s ; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 
v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 ; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v4 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0 -; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v2 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> @@ -1322,52 +1334,58 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s ; GISEL-CI-LABEL: v_mad_mix_v4f32_clamp_postcvt: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GISEL-CI-NEXT: v_mac_f32_e32 v8, v0, v4 ; GISEL-CI-NEXT: v_mac_f32_e32 v9, v1, v5 +; GISEL-CI-NEXT: v_mac_f32_e32 v8, v0, v4 ; GISEL-CI-NEXT: v_mac_f32_e32 v10, v2, v6 ; GISEL-CI-NEXT: v_mac_f32_e32 v11, v3, v7 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v8 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v8 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v10 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v4, v11 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2 ; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2 +; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: v_max_f32_e32 v3, v3, v2 ; GISEL-CI-NEXT: v_max_f32_e32 v2, v4, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GISEL-CI-NEXT: 
v_cvt_f16_f32_e32 v2, v2 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, 1.0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v5 ; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v5 -; GISEL-CI-NEXT: v_min_f32_e32 v2, v3, v5 -; GISEL-CI-NEXT: v_min_f32_e32 v3, v4, v5 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v5 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_min_f32_e32 v2, v2, v5 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_min_f32_e32 v3, v3, v5 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GISEL-CI-NEXT: v_or_b32_e32 v2, v3, v1 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <4 x half> %src0 to <4 x float> %src1.ext = fpext <4 x half> %src1 to <4 x float> @@ -1514,17 +1532,15 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0 ; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_or_b32_e32 v1, v1, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1 -; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> @@ -1676,16 +1692,12 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> ; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0 ; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1 -; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> @@ -1824,16 +1836,19 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr ; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_precvt: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: 
v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GISEL-CI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> @@ -2222,26 +2237,32 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr ; GISEL-CI-LABEL: v_mad_mix_v4f32_clamp_precvt: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v4, v8 clamp +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GISEL-CI-NEXT: v_mad_f32 v1, v1, v5, v9 clamp -; GISEL-CI-NEXT: v_mad_f32 v2, v2, v6, v10 clamp +; GISEL-CI-NEXT: v_mad_f32 v0, v0, v4, v8 clamp ; GISEL-CI-NEXT: v_mad_f32 v3, v3, v7, v11 clamp -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GISEL-CI-NEXT: v_mad_f32 v2, v2, v6, v10 clamp +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GISEL-CI-NEXT: v_or_b32_e32 v2, v2, v1 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <4 x half> %src0 to <4 x float> %src1.ext = fpext <4 x half> %src1 to <4 x float> diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll index 0f95c0255d3ab..3015707418d0a 100644 --- a/llvm/test/CodeGen/AMDGPU/roundeven.ll +++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll @@ -404,23 +404,29 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) { ; GFX6-LABEL: v_roundeven_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_rndne_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_rndne_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_rndne_f32_e32 v0, v0 ; 
GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_roundeven_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_rndne_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_rndne_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_rndne_f32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_roundeven_v2f16: @@ -522,13 +528,16 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) { ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-NEXT: v_rndne_f32_e32 v0, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_rndne_f32_e32 v1, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_rndne_f32_e32 v1, v1 +; GFX6-NEXT: v_rndne_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_roundeven_v2f16_fneg: @@ -538,13 +547,16 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) { ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_rndne_f32_e32 v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_rndne_f32_e32 v1, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_rndne_f32_e32 v1, v1 +; GFX7-NEXT: v_rndne_f32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_roundeven_v2f16_fneg: @@ -655,35 +667,47 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) { ; GFX6-LABEL: v_roundeven_v4f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_rndne_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_rndne_f32_e32 v1, v1 -; GFX6-NEXT: v_rndne_f32_e32 v2, v2 +; GFX6-NEXT: v_rndne_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_rndne_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_rndne_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, 
v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_roundeven_v4f16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_rndne_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_rndne_f32_e32 v1, v1
-; GFX7-NEXT: v_rndne_f32_e32 v2, v2
+; GFX7-NEXT: v_rndne_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_rndne_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_rndne_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_roundeven_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
index d9e0e0298e072..6dedc6920a30e 100644
--- a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
@@ -11,11 +11,11 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY killed $sgpr1
; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:sreg_64 = COPY killed [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:sreg_64 = COPY killed [[COPY1]]
- ; CHECK-NEXT: early-clobber %11:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY2]], 0, 0 :: (invariant load (<2 x s32>) from %ir.ptr, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %17:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY2]], 0, 0 :: (invariant load (<2 x s32>) from %ir.ptr, align 4, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[COPY2]], 8, 0 :: (invariant load (s32) from %ir.ptr + 8, addrspace 4)
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %11.sub0
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %17.sub0
; CHECK-NEXT: $sgpr0 = COPY killed [[COPY3]]
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY killed %11.sub1
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY killed %17.sub1
; CHECK-NEXT: $sgpr1 = COPY killed [[COPY4]]
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]]
; CHECK-NEXT: $sgpr2 = COPY killed [[COPY5]]