diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 649deee346e90..a6d0f35c4f94e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -223,8 +223,9 @@ static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
   };
 }
 
-static bool isRegisterSize(unsigned Size) {
-  return Size % 32 == 0 && Size <= MaxRegisterSize;
+static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
+  return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
+         Size <= MaxRegisterSize;
 }
 
 static bool isRegisterVectorElementType(LLT EltTy) {
@@ -240,8 +241,8 @@ static bool isRegisterVectorType(LLT Ty) {
 }
 
 // TODO: replace all uses of isRegisterType with isRegisterClassType
-static bool isRegisterType(LLT Ty) {
-  if (!isRegisterSize(Ty.getSizeInBits()))
+static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
+  if (!isRegisterSize(ST, Ty.getSizeInBits()))
     return false;
 
   if (Ty.isVector())
@@ -252,19 +253,21 @@
 
 // Any combination of 32 or 64-bit elements up the maximum register size, and
 // multiples of v2s16.
-static LegalityPredicate isRegisterType(unsigned TypeIdx) {
-  return [=](const LegalityQuery &Query) {
-    return isRegisterType(Query.Types[TypeIdx]);
+static LegalityPredicate isRegisterType(const GCNSubtarget &ST,
+                                        unsigned TypeIdx) {
+  return [=, &ST](const LegalityQuery &Query) {
+    return isRegisterType(ST, Query.Types[TypeIdx]);
   };
 }
 
 // RegisterType that doesn't have a corresponding RegClass.
 // TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
 // should be removed.
-static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
-  return [=](const LegalityQuery &Query) {
+static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST,
+                                               unsigned TypeIdx) {
+  return [=, &ST](const LegalityQuery &Query) {
     LLT Ty = Query.Types[TypeIdx];
-    return isRegisterType(Ty) &&
+    return isRegisterType(ST, Ty) &&
            !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
   };
 }
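Reviewer note on the new `[=, &ST]` captures: these predicate factories return lambdas that are stored in the legalizer's rule tables and invoked long after the constructor returns, so capturing the subtarget by reference is only safe because the GCNSubtarget outlives the legalizer rules (both hang off the target machine). A minimal sketch of the pattern with stand-in types — `Subtarget`, `Predicate`, and `isSixteenBitLegal` are illustrative names, not the in-tree API:

#include <cassert>
#include <functional>

struct Subtarget { bool RealTrue16; };           // stand-in for GCNSubtarget
using Predicate = std::function<bool(unsigned)>; // stand-in for LegalityPredicate

// The factory returns a predicate that is stored now and called much later,
// so it captures the (long-lived) subtarget by reference, everything else by
// value -- the same shape as the [=, &ST] captures above.
static Predicate isSixteenBitLegal(const Subtarget &ST) {
  return [&ST](unsigned Size) { return ST.RealTrue16 && Size == 16; };
}

int main() {
  Subtarget GFX11{/*RealTrue16=*/true};
  Predicate P = isSixteenBitLegal(GFX11); // stored, as in the rule tables
  assert(P(16));                          // s16 is legal with real true16
  assert(!P(32));
  return 0;
}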
@@ -348,17 +351,20 @@ static std::initializer_list<LLT> AllS64Vectors =
     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
 
 // Checks whether a type is in the list of legal register types.
-static bool isRegisterClassType(LLT Ty) {
+static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
   if (Ty.isPointerOrPointerVector())
     Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
 
   return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
-         is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty);
+         is_contained(AllScalarTypes, Ty) ||
+         (ST.useRealTrue16Insts() && Ty == S16) ||
+         is_contained(AllS16Vectors, Ty);
 }
 
-static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
-  return [TypeIdx](const LegalityQuery &Query) {
-    return isRegisterClassType(Query.Types[TypeIdx]);
+static LegalityPredicate isRegisterClassType(const GCNSubtarget &ST,
+                                             unsigned TypeIdx) {
+  return [&ST, TypeIdx](const LegalityQuery &Query) {
+    return isRegisterClassType(ST, Query.Types[TypeIdx]);
   };
 }
@@ -510,7 +516,7 @@ static bool loadStoreBitcastWorkaround(const LLT Ty) {
 
 static bool isLoadStoreLegal(const GCNSubtarget &ST,
                              const LegalityQuery &Query) {
   const LLT Ty = Query.Types[0];
-  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
+  return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
          !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
 }
@@ -523,12 +529,12 @@ static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
   if (Size != MemSizeInBits)
     return Size <= 32 && Ty.isVector();
 
-  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
+  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(ST, Ty))
     return true;
 
   // Don't try to handle bitcasting vector ext loads for now.
   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
-         (Size <= 32 || isRegisterSize(Size)) &&
+         (Size <= 32 || isRegisterSize(ST, Size)) &&
          !isRegisterVectorElementType(Ty.getElementType());
 }
@@ -875,7 +881,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
 
   getActionDefinitionsBuilder(G_BITCAST)
       // Don't worry about the size constraint.
-      .legalIf(all(isRegisterClassType(0), isRegisterClassType(1)))
+      .legalIf(all(isRegisterClassType(ST, 0), isRegisterClassType(ST, 1)))
       .lower();
 
   getActionDefinitionsBuilder(G_CONSTANT)
@@ -890,7 +896,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .clampScalar(0, S16, S64);
 
   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
-      .legalIf(isRegisterClassType(0))
+      .legalIf(isRegisterClassType(ST, 0))
       // s1 and s16 are special cases because they have legal operations on
       // them, but don't really occupy registers in the normal way.
       .legalFor({S1, S16})
@@ -1779,7 +1785,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
     unsigned IdxTypeIdx = 2;
 
     getActionDefinitionsBuilder(Op)
-        .customIf([=](const LegalityQuery &Query) {
+        .customIf([=](const LegalityQuery &Query) {
           const LLT EltTy = Query.Types[EltTypeIdx];
           const LLT VecTy = Query.Types[VecTypeIdx];
           const LLT IdxTy = Query.Types[IdxTypeIdx];
@@ -1800,36 +1806,37 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                  IdxTy.getSizeInBits() == 32 &&
                  isLegalVecType;
         })
-      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
-                 bitcastToVectorElement32(VecTypeIdx))
-      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
-      .bitcastIf(
-        all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
-        [=](const LegalityQuery &Query) {
-          // For > 64-bit element types, try to turn this into a 64-bit
-          // element vector since we may be able to do better indexing
-          // if this is scalar. If not, fall back to 32.
-          const LLT EltTy = Query.Types[EltTypeIdx];
-          const LLT VecTy = Query.Types[VecTypeIdx];
-          const unsigned DstEltSize = EltTy.getSizeInBits();
-          const unsigned VecSize = VecTy.getSizeInBits();
-
-          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
-          return std::pair(
-              VecTypeIdx,
-              LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
-        })
-      .clampScalar(EltTypeIdx, S32, S64)
-      .clampScalar(VecTypeIdx, S32, S64)
-      .clampScalar(IdxTypeIdx, S32, S32)
-      .clampMaxNumElements(VecTypeIdx, S32, 32)
-      // TODO: Clamp elements for 64-bit vectors?
-      .moreElementsIf(
-        isIllegalRegisterType(VecTypeIdx),
-        moreElementsToNextExistingRegClass(VecTypeIdx))
-      // It should only be necessary with variable indexes.
-      // As a last resort, lower to the stack
-      .lower();
+        .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
+                       scalarOrEltNarrowerThan(VecTypeIdx, 32)),
+                   bitcastToVectorElement32(VecTypeIdx))
+        //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
+        .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
+                       scalarOrEltWiderThan(VecTypeIdx, 64)),
+                   [=](const LegalityQuery &Query) {
+                     // For > 64-bit element types, try to turn this into a
+                     // 64-bit element vector since we may be able to do better
+                     // indexing if this is scalar. If not, fall back to 32.
+                     const LLT EltTy = Query.Types[EltTypeIdx];
+                     const LLT VecTy = Query.Types[VecTypeIdx];
+                     const unsigned DstEltSize = EltTy.getSizeInBits();
+                     const unsigned VecSize = VecTy.getSizeInBits();
+
+                     const unsigned TargetEltSize =
+                         DstEltSize % 64 == 0 ? 64 : 32;
+                     return std::pair(VecTypeIdx,
+                                      LLT::fixed_vector(VecSize / TargetEltSize,
+                                                        TargetEltSize));
+                   })
+        .clampScalar(EltTypeIdx, S32, S64)
+        .clampScalar(VecTypeIdx, S32, S64)
+        .clampScalar(IdxTypeIdx, S32, S32)
+        .clampMaxNumElements(VecTypeIdx, S32, 32)
+        // TODO: Clamp elements for 64-bit vectors?
+        .moreElementsIf(isIllegalRegisterType(ST, VecTypeIdx),
+                        moreElementsToNextExistingRegClass(VecTypeIdx))
+        // It should only be necessary with variable indexes.
+        // As a last resort, lower to the stack
+        .lower();
   }
 
   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
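Reviewer note: the reflowed bitcastIf above is behavior-preserving; the only functional change in that hunk is `isIllegalRegisterType(ST, VecTypeIdx)`. For readers following the 64-vs-32 fallback, here is the element-size selection pulled out with two worked cases (`pickVectorShape` is a hypothetical standalone condensation, not the in-tree code):

#include <cassert>
#include <utility>

// Mirrors the lambda above: elements whose size is a multiple of 64 re-split
// the vector into 64-bit lanes (better for scalar indexing); anything else
// falls back to 32-bit lanes.
static std::pair<unsigned, unsigned> pickVectorShape(unsigned DstEltSize,
                                                     unsigned VecSize) {
  const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
  return {VecSize / TargetEltSize, TargetEltSize}; // (NumElts, EltSizeInBits)
}

int main() {
  assert(pickVectorShape(128, 256) == std::make_pair(4u, 64u)); // <2 x s128> -> <4 x s64>
  assert(pickVectorShape(96, 192) == std::make_pair(6u, 32u));  // <2 x s96>  -> <6 x s32>
  return 0;
}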
@@ -1876,15 +1883,15 @@
   }
 
-  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
-    .legalForCartesianProduct(AllS32Vectors, {S32})
-    .legalForCartesianProduct(AllS64Vectors, {S64})
-    .clampNumElements(0, V16S32, V32S32)
-    .clampNumElements(0, V2S64, V16S64)
-    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
-    .moreElementsIf(
-      isIllegalRegisterType(0),
-      moreElementsToNextExistingRegClass(0));
+  auto &BuildVector =
+      getActionDefinitionsBuilder(G_BUILD_VECTOR)
+          .legalForCartesianProduct(AllS32Vectors, {S32})
+          .legalForCartesianProduct(AllS64Vectors, {S64})
+          .clampNumElements(0, V16S32, V32S32)
+          .clampNumElements(0, V2S64, V16S64)
+          .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
+          .moreElementsIf(isIllegalRegisterType(ST, 0),
+                          moreElementsToNextExistingRegClass(0));
 
   if (ST.hasScalarPackInsts()) {
     BuildVector
@@ -1904,14 +1911,14 @@
       .lower();
   }
 
-  BuildVector.legalIf(isRegisterType(0));
+  BuildVector.legalIf(isRegisterType(ST, 0));
 
   // FIXME: Clamp maximum size
   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
-    .legalIf(all(isRegisterType(0), isRegisterType(1)))
-    .clampMaxNumElements(0, S32, 32)
-    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
-    .clampMaxNumElements(0, S16, 64);
+      .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
+      .clampMaxNumElements(0, S32, 32)
+      .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
+      .clampMaxNumElements(0, S16, 64);
 
   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
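Reviewer note: the G_BUILD_VECTOR and G_CONCAT_VECTORS clamps are unchanged apart from threading ST through; they all bound results at the 1024-bit register-tuple limit. The arithmetic, spelled out (`MaxRegisterSize` = 1024 is the existing target constant; this is only a worked check, not in-tree code):

#include <cassert>

int main() {
  const unsigned MaxRegisterSize = 1024; // widest register tuple, in bits
  // .clampMaxNumElements(0, S32, 32): 32 x 32-bit lanes fill the widest tuple.
  assert(32 * 32 == MaxRegisterSize);
  // .clampMaxNumElements(0, S16, 64): 64 x 16-bit lanes, the same 1024-bit cap.
  assert(64 * 16 == MaxRegisterSize);
  // .clampNumElements(0, V2S64, V16S64): 16 x 64-bit lanes, same cap again.
  assert(16 * 64 == MaxRegisterSize);
  return 0;
}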
@@ -1932,34 +1939,40 @@
       return false;
     };
 
-    auto &Builder = getActionDefinitionsBuilder(Op)
-      .legalIf(all(isRegisterType(0), isRegisterType(1)))
-      .lowerFor({{S16, V2S16}})
-      .lowerIf([=](const LegalityQuery &Query) {
-          const LLT BigTy = Query.Types[BigTyIdx];
-          return BigTy.getSizeInBits() == 32;
-        })
-      // Try to widen to s16 first for small types.
-      // TODO: Only do this on targets with legal s16 shifts
-      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
-      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
-      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
-      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
-                           elementTypeIs(1, S16)),
-                       changeTo(1, V2S16))
-      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
-      // worth considering the multiples of 64 since 2*192 and 2*384 are not
-      // valid.
-      .clampScalar(LitTyIdx, S32, S512)
-      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
-      // Break up vectors with weird elements into scalars
-      .fewerElementsIf(
-        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
-        scalarize(0))
-      .fewerElementsIf(
-        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
-        scalarize(1))
-      .clampScalar(BigTyIdx, S32, MaxScalar);
+    auto &Builder =
+        getActionDefinitionsBuilder(Op)
+            .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
+            .lowerFor({{S16, V2S16}})
+            .lowerIf([=](const LegalityQuery &Query) {
+              const LLT BigTy = Query.Types[BigTyIdx];
+              return BigTy.getSizeInBits() == 32;
+            })
+            // Try to widen to s16 first for small types.
+            // TODO: Only do this on targets with legal s16 shifts
+            .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
+            .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
+            .moreElementsIf(isSmallOddVector(BigTyIdx),
+                            oneMoreElement(BigTyIdx))
+            .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
+                                 elementTypeIs(1, S16)),
+                             changeTo(1, V2S16))
+            // Clamp the little scalar to s8-s256 and make it a power of 2. It's
+            // not worth considering the multiples of 64 since 2*192 and 2*384
+            // are not valid.
+            .clampScalar(LitTyIdx, S32, S512)
+            .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
+            // Break up vectors with weird elements into scalars
+            .fewerElementsIf(
+                [=](const LegalityQuery &Query) {
+                  return notValidElt(Query, LitTyIdx);
+                },
+                scalarize(0))
+            .fewerElementsIf(
+                [=](const LegalityQuery &Query) {
+                  return notValidElt(Query, BigTyIdx);
+                },
+                scalarize(1))
+            .clampScalar(BigTyIdx, S32, MaxScalar);
 
     if (Op == G_MERGE_VALUES) {
       Builder.widenScalarIf(
@@ -3146,7 +3159,7 @@ bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
     } else {
       // Extract the subvector.
-      if (isRegisterType(ValTy)) {
+      if (isRegisterType(ST, ValTy)) {
         // If this a case where G_EXTRACT is legal, use it.
         // (e.g. <3 x s32> -> <4 x s32>)
         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
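Reviewer note: that is the last AMDGPULegalizerInfo.cpp hunk. The common thread through the file is the new GCNSubtarget parameter; the only behavioral change sits in isRegisterSize(). A standalone sketch of the new predicate, decoupled from GCNSubtarget so the truth table is easy to see (`useRealTrue16` is an illustrative stand-in for `ST.useRealTrue16Insts()`, not the in-tree API):

#include <cassert>

static bool isRegisterSizeSketch(bool useRealTrue16, unsigned Size) {
  const unsigned MaxRegisterSize = 1024;
  // 16 bits is register-sized only when the subtarget has real 16-bit
  // registers; otherwise only multiples of 32 bits qualify, as before.
  return ((useRealTrue16 && Size == 16) || Size % 32 == 0) &&
         Size <= MaxRegisterSize;
}

int main() {
  assert(isRegisterSizeSketch(true, 16));    // true16: s16 is a register type
  assert(!isRegisterSizeSketch(false, 16));  // fake16: s16 still lives in 32 bits
  assert(isRegisterSizeSketch(false, 96));   // 32-bit multiples are unchanged
  assert(!isRegisterSizeSketch(true, 48));   // rejected on either path
  assert(!isRegisterSizeSketch(true, 2048)); // above MaxRegisterSize
  return 0;
}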
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index 2d8dc9d47225e..1c1a6dac75a17 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -11,7 +11,7 @@ def SGPRRegBank : RegisterBank<"SGPR",
 >;
 
 def VGPRRegBank : RegisterBank<"VGPR",
-  [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_288, VReg_320, VReg_352, VReg_384, VReg_512, VReg_1024]
+  [VGPR_16_Lo128, VGPR_16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_288, VReg_320, VReg_352, VReg_384, VReg_512, VReg_1024]
 >;
 
 // It is helpful to distinguish conditions from ordinary SGPRs.
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 71c720ed09b5f..e365690f8b4dc 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -35,7 +35,7 @@ static cl::opt<bool> EnableSpillSGPRToVGPR(
   cl::ReallyHidden,
   cl::init(true));
 
-std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
+std::array<std::vector<int16_t>, 32> SIRegisterInfo::RegSplitParts;
 std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
 
 // Map numbers of DWORDs to indexes in SubRegFromChannelTable.
@@ -351,9 +351,9 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
   static auto InitializeRegSplitPartsOnce = [this]() {
     for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
       unsigned Size = getSubRegIdxSize(Idx);
-      if (Size & 31)
+      if (Size & 15)
         continue;
-      std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
+      std::vector<int16_t> &Vec = RegSplitParts[Size / 16 - 1];
       unsigned Pos = getSubRegIdxOffset(Idx);
       if (Pos % Size)
         continue;
@@ -3554,14 +3554,14 @@ bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI,
 ArrayRef<int16_t>
 SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
                                  unsigned EltSize) const {
   const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
-  assert(RegBitWidth >= 32 && RegBitWidth <= 1024);
+  assert(RegBitWidth >= 32 && RegBitWidth <= 1024 && EltSize >= 2);
 
-  const unsigned RegDWORDs = RegBitWidth / 32;
-  const unsigned EltDWORDs = EltSize / 4;
-  assert(RegSplitParts.size() + 1 >= EltDWORDs);
+  const unsigned RegHalves = RegBitWidth / 16;
+  const unsigned EltHalves = EltSize / 2;
+  assert(RegSplitParts.size() + 1 >= EltHalves);
 
-  const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
-  const unsigned NumParts = RegDWORDs / EltDWORDs;
+  const std::vector<int16_t> &Parts = RegSplitParts[EltHalves - 1];
+  const unsigned NumParts = RegHalves / EltHalves;
 
   return ArrayRef(Parts.data(), NumParts);
 }
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index a434efb70d052..a64180daea2ad 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -37,11 +37,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
   BitVector RegPressureIgnoredUnits;
 
   /// Sub reg indexes for getRegSplitParts.
-  /// First index represents subreg size from 1 to 16 DWORDs.
+  /// First index represents subreg size from 1 to 32 half-DWORDs.
   /// The inner vector is sorted by bit offset.
   /// Provided a register can be fully split with given subregs,
   /// all elements of the inner vector combined give a full lane mask.
-  static std::array<std::vector<int16_t>, 16> RegSplitParts;
+  static std::array<std::vector<int16_t>, 32> RegSplitParts;
 
   // Table representing sub reg of given width and offset.
   // First index is subreg size: 32, 64, 96, 128, 160, 192, 224, 256, 512.
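Reviewer note: the RegSplitParts change is the mirror image on the register-info side — split granularity drops from 32-bit DWORDs to 16-bit halves, which doubles the table to 32 rows and allows EltSize == 2 (hence the new assert). The indexing arithmetic with worked cases (`numSplitParts` is a hypothetical condensation of getRegSplitParts(), not the in-tree code):

#include <cassert>

static unsigned numSplitParts(unsigned RegBitWidth, unsigned EltSizeBytes) {
  const unsigned RegHalves = RegBitWidth / 16; // register width in 16-bit units
  const unsigned EltHalves = EltSizeBytes / 2; // element size in 16-bit units
  return RegHalves / EltHalves;                // NumParts in getRegSplitParts()
}

int main() {
  assert(numSplitParts(32, 2) == 2);    // one VGPR_32 splits into lo16 + hi16
  assert(numSplitParts(128, 4) == 4);   // old DWORD-granular case, unchanged
  assert(numSplitParts(1024, 64) == 2); // two 512-bit halves of a 1024-bit tuple
  return 0;
}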
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index c521d0dd3ad2d..6a92e54b69edc 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2483,6 +2483,8 @@ bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {
 // (move from MC* level to Target* level). Return size in bits.
 unsigned getRegBitWidth(unsigned RCID) {
   switch (RCID) {
+  case AMDGPU::VGPR_16RegClassID:
+  case AMDGPU::VGPR_16_Lo128RegClassID:
   case AMDGPU::SGPR_LO16RegClassID:
   case AMDGPU::AGPR_LO16RegClassID:
     return 16;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 3d7fec9a5986c..2389924b82484 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -3,7 +3,8 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
 
 define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX6-LABEL: v_uaddsat_i7:
@@ -35,14 +36,32 @@ define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 9, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10PLUS-LABEL: v_uaddsat_i7:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 9, v0
-; GFX10PLUS-NEXT:    v_lshlrev_b16 v1, 9, v1
-; GFX10PLUS-NEXT:    v_add_nc_u16 v0, v0, v1 clamp
-; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, 9, v0
-; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_uaddsat_i7:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b16 v0, 9, v0
+; GFX10-NEXT:    v_lshlrev_b16 v1, 9, v1
+; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1 clamp
+; GFX10-NEXT:    v_lshrrev_b16 v0, 9, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_uaddsat_i7:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.l, 9, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 9, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v0.h clamp
+; GFX11-TRUE16-NEXT:    v_lshrrev_b16 v0.l, 9, v0.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_uaddsat_i7:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v0, 9, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 9, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v1 clamp
+; GFX11-FAKE16-NEXT:    v_lshrrev_b16 v0, 9, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call i7 @llvm.uadd.sat.i7(i7 %lhs, i7 %rhs)
   ret i7 %result
 }
@@ -78,14 +97,32 @@ define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg
%rhs) { ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_uaddsat_i7: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9 -; GFX10PLUS-NEXT: v_add_nc_u16 v0, s0, s1 clamp -; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 9, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_uaddsat_i7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl_b32 s0, s0, 9 +; GFX10-NEXT: s_lshl_b32 s1, s1, 9 +; GFX10-NEXT: v_add_nc_u16 v0, s0, s1 clamp +; GFX10-NEXT: v_lshrrev_b16 v0, 9, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: s_uaddsat_i7: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 9 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, s0, s1 clamp +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 9, v0.l +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_uaddsat_i7: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 9 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 9 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, s0, s1 clamp +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 9, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call i7 @llvm.uadd.sat.i7(i7 %lhs, i7 %rhs) ret i7 %result } @@ -120,14 +157,32 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) { ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_uaddsat_i8: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 8, v0 -; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX10PLUS-NEXT: v_add_nc_u16 v0, v0, v1 clamp -; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 8, v0 -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_uaddsat_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 clamp +; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_uaddsat_i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h clamp +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 8, v0.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_uaddsat_i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v0, v1 clamp +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 8, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result } @@ -163,14 +218,32 @@ define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_uaddsat_i8: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10PLUS-NEXT: v_add_nc_u16 v0, s0, s1 clamp -; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 8, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: ; 
return to shader part epilog +; GFX10-LABEL: s_uaddsat_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl_b32 s0, s0, 8 +; GFX10-NEXT: s_lshl_b32 s1, s1, 8 +; GFX10-NEXT: v_add_nc_u16 v0, s0, s1 clamp +; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: s_uaddsat_i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, s0, s1 clamp +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 8, v0.l +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_uaddsat_i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, s0, s1 clamp +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 8, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result } @@ -247,25 +320,40 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_uaddsat_v2i8: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 -; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_u16 v0, v0, v1 clamp -; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_uaddsat_v2i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l +; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, v1 clamp +; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_uaddsat_v2i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, v1 clamp +; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i16 %lhs.arg to <2 x i8> %rhs = bitcast i16 %rhs.arg to <2 x i8> %result = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs) @@ -358,29 +446,50 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_uaddsat_v2i8: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s2, s0, 8 -; GFX11-NEXT: s_lshr_b32 s3, s1, 8 -; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX11-NEXT: s_lshr_b32 s2, s0, 16 -; GFX11-NEXT: s_lshr_b32 s3, s1, 16 -; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008 -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX11-NEXT: v_pk_add_u16 v0, s0, s1 clamp -; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: s_uaddsat_v2i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, s1 clamp +; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_uaddsat_v2i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, s1 clamp +; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-FAKE16-NEXT: ; 
return to shader part epilog %lhs = bitcast i16 %lhs.arg to <2 x i8> %rhs = bitcast i16 %rhs.arg to <2 x i8> %result = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs) @@ -524,36 +633,69 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_uaddsat_v4i8: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v4 -; GFX11-NEXT: v_lshl_or_b32 v3, v3, 16, v5 -; GFX11-NEXT: v_alignbit_b32 v0, v6, v0, 16 -; GFX11-NEXT: v_alignbit_b32 v1, v7, v1, 16 -; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_u16 v2, v2, v3 clamp -; GFX11-NEXT: v_pk_add_u16 v0, v0, v1 clamp -; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8 -; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v0 -; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v2 -; GFX11-NEXT: v_or3_b32 v0, v1, v3, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_uaddsat_v4i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v7.l +; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v2, 8, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, v1 clamp +; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v1, 8, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v2, v1 clamp +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xff, v0, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX11-TRUE16-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_uaddsat_v4i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v3, 8, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v3, 16, v5 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v6, v0, 16 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v7, v1, 16 +; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, v3 clamp +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, v1 clamp +; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xff, v1, v2 +; GFX11-FAKE16-NEXT: v_or3_b32 v0, v1, v3, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> %result = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs) @@ -729,46 +871,89 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_uaddsat_v4i8: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s2, s0, 8 -; GFX11-NEXT: s_lshr_b32 s3, s0, 24 -; GFX11-NEXT: s_lshr_b32 s4, s1, 8 -; GFX11-NEXT: s_lshr_b32 s5, s1, 24 -; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s2 -; GFX11-NEXT: s_pack_hl_b32_b16 s0, s0, s3 -; GFX11-NEXT: s_pack_ll_b32_b16 s3, s1, s4 -; GFX11-NEXT: s_lshr_b32 s4, s2, 16 -; GFX11-NEXT: s_pack_hl_b32_b16 s1, s1, s5 -; GFX11-NEXT: s_lshr_b32 s5, s3, 16 -; GFX11-NEXT: s_lshl_b32 s2, s2, 0x80008 -; GFX11-NEXT: s_lshl_b32 s4, s4, 8 -; GFX11-NEXT: s_lshl_b32 s3, s3, 0x80008 -; GFX11-NEXT: s_lshl_b32 s5, s5, 8 -; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5 -; GFX11-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-NEXT: s_lshr_b32 s5, s1, 16 -; GFX11-NEXT: v_pk_add_u16 v0, s2, s3 clamp -; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008 -; GFX11-NEXT: s_lshl_b32 s4, s4, 8 -; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008 -; GFX11-NEXT: s_lshl_b32 s2, s5, 8 -; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2 -; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_u16 v1, s0, s1 clamp -; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 -; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1 -; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: s_uaddsat_v4i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 8 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 24 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s0, s2 +; GFX11-TRUE16-NEXT: s_pack_hl_b32_b16 s0, s0, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s1, s4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s2, 16 +; GFX11-TRUE16-NEXT: s_pack_hl_b32_b16 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 0x80008 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 0x80008 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s2, s3 clamp +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s5, 8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s0, s1 clamp +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xff, v0, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX11-TRUE16-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_uaddsat_v4i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 24 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s0, s2 +; GFX11-FAKE16-NEXT: s_pack_hl_b32_b16 s0, s0, s3 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s1, s4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s2, 16 +; GFX11-FAKE16-NEXT: s_pack_hl_b32_b16 s1, s1, s5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s3, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 0x80008 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 0x80008 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s2, s3 clamp +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s5, 8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s0, s1 clamp +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v1, 16, 8 +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xff, v0, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX11-FAKE16-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> %result = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs) @@ -1761,11 +1946,23 @@ define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) { ; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_uaddsat_i16: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_add_nc_u16 v0, v0, v1 clamp -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_uaddsat_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_uaddsat_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l clamp +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_uaddsat_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v0, v1 clamp +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result } @@ -1795,11 +1992,23 @@ define amdgpu_ps i16 @s_uaddsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_uaddsat_i16: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_add_nc_u16 v0, s0, s1 clamp -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_uaddsat_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_nc_u16 v0, s0, s1 clamp +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: s_uaddsat_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, s0, s1 clamp +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_uaddsat_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, s0, s1 clamp +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result } @@ -1825,10 +2034,20 @@ define amdgpu_ps half @uaddsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: uaddsat_i16_sv: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_add_nc_u16 v0, s0, v0 clamp -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: uaddsat_i16_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_nc_u16 v0, s0, v0 clamp +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: uaddsat_i16_sv: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, s0, v0.l clamp +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: uaddsat_i16_sv: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, s0, v0 clamp +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs) %cast = bitcast i16 %result to half ret half %cast @@ -1855,10 +2074,20 @@ define amdgpu_ps half @uaddsat_i16_vs(i16 %lhs, i16 inreg %rhs) { ; GFX9-NEXT: 
v_add_u16_e64 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: uaddsat_i16_vs: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_add_nc_u16 v0, v0, s0 clamp -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: uaddsat_i16_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_nc_u16 v0, v0, s0 clamp +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: uaddsat_i16_vs: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, s0 clamp +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: uaddsat_i16_vs: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v0, s0 clamp +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs) %cast = bitcast i16 %result to half ret half %cast diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll index 5a8b5fcc93f61..34d36581a21db 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -3,7 +3,8 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s define i7 @v_usubsat_i7(i7 %lhs, i7 %rhs) { ; GFX6-LABEL: v_usubsat_i7: @@ -34,14 +35,32 @@ define i7 @v_usubsat_i7(i7 %lhs, i7 %rhs) { ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_usubsat_i7: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 9, v0 -; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 9, v1 -; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, v1 clamp -; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 9, v0 -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_usubsat_i7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b16 v0, 9, v0 +; GFX10-NEXT: v_lshlrev_b16 v1, 9, v1 +; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp +; GFX10-NEXT: v_lshrrev_b16 v0, 9, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_usubsat_i7: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 9, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 9, v1.l +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v0.h clamp +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 9, v0.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_usubsat_i7: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 9, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 9, v1 +; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, v1 clamp +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 9, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call 
i7 @llvm.usub.sat.i7(i7 %lhs, i7 %rhs) ret i7 %result } @@ -76,14 +95,32 @@ define amdgpu_ps i7 @s_usubsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_usubsat_i7: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9 -; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, s1 clamp -; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 9, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_usubsat_i7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl_b32 s0, s0, 9 +; GFX10-NEXT: s_lshl_b32 s1, s1, 9 +; GFX10-NEXT: v_sub_nc_u16 v0, s0, s1 clamp +; GFX10-NEXT: v_lshrrev_b16 v0, 9, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: s_usubsat_i7: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 9 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 9 +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, s0, s1 clamp +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 9, v0.l +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_usubsat_i7: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 9 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 9 +; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, s0, s1 clamp +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 9, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call i7 @llvm.usub.sat.i7(i7 %lhs, i7 %rhs) ret i7 %result } @@ -117,14 +154,32 @@ define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) { ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_usubsat_i8: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 8, v0 -; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, v1 clamp -; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 8, v0 -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_usubsat_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp +; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_usubsat_i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v0.h clamp +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 8, v0.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_usubsat_i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, v1 clamp +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 8, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result } @@ -159,14 +214,32 @@ define amdgpu_ps i8 @s_usubsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_usubsat_i8: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10PLUS-NEXT: 
v_sub_nc_u16 v0, s0, s1 clamp -; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 8, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_usubsat_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl_b32 s0, s0, 8 +; GFX10-NEXT: s_lshl_b32 s1, s1, 8 +; GFX10-NEXT: v_sub_nc_u16 v0, s0, s1 clamp +; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: s_usubsat_i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, s0, s1 clamp +; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 8, v0.l +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_usubsat_i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, s0, s1 clamp +; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 8, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result } @@ -241,25 +314,40 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_usubsat_v2i8: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 -; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_sub_u16 v0, v0, v1 clamp -; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_usubsat_v2i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l +; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_sub_u16 v0, v0, v1 clamp +; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_usubsat_v2i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 
op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_sub_u16 v0, v0, v1 clamp +; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i16 %lhs.arg to <2 x i8> %rhs = bitcast i16 %rhs.arg to <2 x i8> %result = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs) @@ -350,29 +438,50 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_usubsat_v2i8: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s2, s0, 8 -; GFX11-NEXT: s_lshr_b32 s3, s1, 8 -; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX11-NEXT: s_lshr_b32 s2, s0, 16 -; GFX11-NEXT: s_lshr_b32 s3, s1, 16 -; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008 -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX11-NEXT: v_pk_sub_u16 v0, s0, s1 clamp -; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: s_usubsat_v2i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX11-TRUE16-NEXT: v_pk_sub_u16 v0, s0, s1 clamp +; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_usubsat_v2i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX11-FAKE16-NEXT: v_pk_sub_u16 v0, s0, s1 clamp +; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v1, 8, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %lhs = bitcast i16 %lhs.arg to <2 x i8> %rhs = bitcast i16 %rhs.arg to <2 x i8> %result = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs) @@ -512,36 +621,69 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_usubsat_v4i8: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v4 -; GFX11-NEXT: v_lshl_or_b32 v3, v3, 16, v5 -; GFX11-NEXT: v_alignbit_b32 v0, v6, v0, 16 -; GFX11-NEXT: v_alignbit_b32 v1, v7, v1, 16 -; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_sub_u16 v2, v2, v3 clamp -; GFX11-NEXT: v_pk_sub_u16 v0, v0, v1 clamp -; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8 -; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v0 -; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v2 -; GFX11-NEXT: v_or3_b32 v0, v1, v3, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_usubsat_v4i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v7.l +; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v2, 8, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_sub_u16 v0, v0, v1 clamp +; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v1, 8, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_sub_u16 v1, v2, v1 clamp +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xff, v0, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX11-TRUE16-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_usubsat_v4i8: +; GFX11-FAKE16: ; %bb.0: 
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v3, 16, v5 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v6, v0, 16 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v7, v1, 16 +; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_sub_u16 v2, v2, v3 clamp +; GFX11-FAKE16-NEXT: v_pk_sub_u16 v0, v0, v1 clamp +; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xff, v1, v2 +; GFX11-FAKE16-NEXT: v_or3_b32 v0, v1, v3, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> %result = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs) @@ -713,46 +855,89 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_usubsat_v4i8: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s2, s0, 8 -; GFX11-NEXT: s_lshr_b32 s3, s0, 24 -; GFX11-NEXT: s_lshr_b32 s4, s1, 8 -; GFX11-NEXT: s_lshr_b32 s5, s1, 24 -; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s2 -; GFX11-NEXT: s_pack_hl_b32_b16 s0, s0, s3 -; GFX11-NEXT: s_pack_ll_b32_b16 s3, s1, s4 -; GFX11-NEXT: s_lshr_b32 s4, s2, 16 -; GFX11-NEXT: s_pack_hl_b32_b16 s1, s1, s5 -; GFX11-NEXT: s_lshr_b32 s5, s3, 16 -; GFX11-NEXT: s_lshl_b32 s2, s2, 0x80008 -; GFX11-NEXT: s_lshl_b32 s4, s4, 8 -; GFX11-NEXT: s_lshl_b32 s3, s3, 0x80008 -; GFX11-NEXT: s_lshl_b32 s5, s5, 8 -; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5 -; GFX11-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-NEXT: s_lshr_b32 s5, s1, 16 -; GFX11-NEXT: v_pk_sub_u16 v0, s2, s3 clamp -; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008 -; GFX11-NEXT: s_lshl_b32 s4, s4, 8 -; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008 -; GFX11-NEXT: s_lshl_b32 s2, s5, 8 -; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2 -; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_sub_u16 v1, s0, s1 clamp -; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 -; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1 -; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: s_usubsat_v4i8: +; GFX11-TRUE16: ; %bb.0: +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 24 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s0, s2 +; GFX11-TRUE16-NEXT: s_pack_hl_b32_b16 s0, s0, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s1, s4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s2, 16 +; GFX11-TRUE16-NEXT: s_pack_hl_b32_b16 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 0x80008 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 0x80008 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-TRUE16-NEXT: v_pk_sub_u16 v0, s2, s3 clamp +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s5, 8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_sub_u16 v1, s0, s1 clamp +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xff, v0, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX11-TRUE16-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_usubsat_v4i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 24 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s0, s2 +; GFX11-FAKE16-NEXT: s_pack_hl_b32_b16 s0, s0, s3 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s1, s4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s2, 16 +; GFX11-FAKE16-NEXT: s_pack_hl_b32_b16 s1, s1, s5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s3, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 0x80008 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 0x80008 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: v_pk_sub_u16 v0, s2, s3 clamp +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s5, 8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_sub_u16 v1, s0, s1 clamp +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v1 +; GFX11-FAKE16-NEXT: 
v_bfe_u32 v1, v1, 16, 8 +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xff, v0, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX11-FAKE16-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> %result = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs) @@ -1678,11 +1863,23 @@ define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) { ; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_usubsat_i16: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, v1 clamp -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_usubsat_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_usubsat_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v1.l clamp +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_usubsat_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, v1 clamp +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result } @@ -1711,11 +1908,23 @@ define amdgpu_ps i16 @s_usubsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_usubsat_i16: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, s1 clamp -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_usubsat_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_sub_nc_u16 v0, s0, s1 clamp +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: s_usubsat_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, s0, s1 clamp +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_usubsat_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, s0, s1 clamp +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result } @@ -1740,10 +1949,20 @@ define amdgpu_ps half @usubsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: usubsat_i16_sv: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, v0 clamp -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: usubsat_i16_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_sub_nc_u16 v0, s0, v0 clamp +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: usubsat_i16_sv: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, s0, v0.l clamp +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: usubsat_i16_sv: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, s0, v0 clamp +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs) %cast = bitcast i16 
%result to half ret half %cast @@ -1769,10 +1988,20 @@ define amdgpu_ps half @usubsat_i16_vs(i16 %lhs, i16 inreg %rhs) { ; GFX9-NEXT: v_sub_u16_e64 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: usubsat_i16_vs: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, s0 clamp -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: usubsat_i16_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_sub_nc_u16 v0, v0, s0 clamp +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: usubsat_i16_vs: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, s0 clamp +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: usubsat_i16_vs: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, s0 clamp +; GFX11-FAKE16-NEXT: ; return to shader part epilog %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs) %cast = bitcast i16 %result to half ret half %cast diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll index a94f27a0332c7..e1ecd34ad24e0 100644 --- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll @@ -98,7 +98,9 @@ define amdgpu_kernel void @fadd_f16( ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h ; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-GISEL-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll index a33fd03e0ce03..eefcf56586033 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll @@ -501,14 +501,12 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX11-GISEL-TRUE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0 ; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GISEL-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1 -; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2.l -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.l -; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v1.l ; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v3.l +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v2.h +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v0.h ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index f490ecf68d984..e2b61190719d3 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -7847,8 +7847,11 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; GFX11-GISEL-TRUE16-NEXT: 
global_load_u16 v3, v2, s[6:7] glc dlc ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l -; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.h, 2.0, v1.l -; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v3.l +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h +; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v1.l ; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_med3_f16 v0.l, v0.l, v0.h, v1.l ; GFX11-GISEL-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll index 9540aa322f6ef..0071842baad27 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll @@ -537,13 +537,12 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) { ; GFX11-GISEL-TRUE16: ; %bb.0: ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v3, 0x7fff -; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v1, 0xffff8000, v1, v3 ; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v2, 0xffff8000, v2, v3 ; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l ; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v4.l, v2.l +; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v0.h, v2.l ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v2f16_v2i32: @@ -649,11 +648,8 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) { ; GFX11-GISEL-TRUE16-LABEL: test_ldexp_v2f16_v2i16: ; GFX11-GISEL-TRUE16: ; %bb.0: ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v2.l, v3.l +; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v0.h, v1.h ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v2f16_v2i16: @@ -793,15 +789,14 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) { ; GFX11-GISEL-TRUE16: ; %bb.0: ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v5, 0x7fff -; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v4, 0xffff8000, v4, v5 ; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v2, 0xffff8000, v2, v5 ; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v3, 0xffff8000, v3, v5 ; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l ; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, 
v2.l -; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v3.l +; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v0.h, v3.l ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v3f16_v3i32: @@ -923,12 +918,9 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) { ; GFX11-GISEL-TRUE16-LABEL: test_ldexp_v3f16_v3i16: ; GFX11-GISEL-TRUE16: ; %bb.0: ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l ; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v4.l, v5.l +; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v0.h, v2.h ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v3f16_v3i16: @@ -1097,19 +1089,17 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) { ; GFX11-GISEL-TRUE16: ; %bb.0: ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v6, 0x7fff -; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v2, 0xffff8000, v2, v6 ; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v3, 0xffff8000, v3, v6 ; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v4, 0xffff8000, v4, v6 ; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v5, 0xffff8000, v5, v6 ; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l ; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v7.l, v3.l +; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v0.h, v3.l ; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l ; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v8.l, v5.l +; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v1.h, v5.l ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v4f16_v4i32: @@ -1264,15 +1254,10 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) { ; GFX11-GISEL-TRUE16-LABEL: test_ldexp_v4f16_v4i16: ; GFX11-GISEL-TRUE16: ; %bb.0: ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l +; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v0.h, v2.h ; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v4.l, v6.l -; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v5.l, v7.l +; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v1.h, v3.h ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v4f16_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll 
b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index 53f1c476e49ee..3725cea1173aa 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -1364,12 +1364,12 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-GISEL-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v1.l, 0xffc0, v1.l -; GFX11-GISEL-TRUE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, 0xffc0, v0.l +; GFX11-GISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-GISEL-TRUE16-NEXT: s_endpgm ; ; GFX11-GISEL-FAKE16-LABEL: v_test_i16_x_sub_64: @@ -1802,17 +1802,22 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GISEL-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] glc dlc ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v1.l, 0xffc0, v1.l -; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v2.l, 0xffc0, v2.l -; GFX11-GISEL-TRUE16-NEXT: global_store_b16 v0, v1, s[0:1] dlc +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, 0xffc0, v0.l +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, 0xffc0, v0.h +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-GISEL-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1] dlc ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-GISEL-TRUE16-NEXT: global_store_b16 v0, v2, s[0:1] dlc +; GFX11-GISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] dlc ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll index 072151dd6f5a0..3ab08a8bed860 100644 --- a/llvm/test/CodeGen/AMDGPU/v_pack.ll +++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll @@ -117,8 +117,11 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace ; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc ; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v1.l -; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l +; GFX11-GISEL-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; 
GFX11-GISEL-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h ; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-GISEL-REAL16-NEXT: ;;#ASMSTART @@ -249,8 +252,11 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs ; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc ; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-REAL16-NEXT: v_subrev_f16_e32 v0.l, 2.0, v1.l -; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l +; GFX11-GISEL-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-GISEL-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-REAL16-NEXT: v_subrev_f16_e32 v0.l, 2.0, v0.l +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h ; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-GISEL-REAL16-NEXT: ;;#ASMSTART @@ -495,8 +501,11 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace( ; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc ; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v1.l -; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l +; GFX11-GISEL-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-GISEL-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h ; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, |v0.l|, |v0.h| ; GFX11-GISEL-REAL16-NEXT: ;;#ASMSTART @@ -629,8 +638,11 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace( ; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc ; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v1.l -; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l +; GFX11-GISEL-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-GISEL-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h ; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, -v0.l, -v0.h ; GFX11-GISEL-REAL16-NEXT: ;;#ASMSTART