diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h index 684a9bf554fb1..503f61216d9e6 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h @@ -336,12 +336,12 @@ bool isKnownToBeAPowerOfTwo(Register Val, const MachineRegisterInfo &MRI, /// Returns true if \p Val can be assumed to never be a NaN. If \p SNaN is true, /// this returns if \p Val can be assumed to never be a signaling NaN. -bool isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI, +bool isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI, GISelValueTracking *ValueTracking, bool SNaN = false); /// Returns true if \p Val can be assumed to never be a signaling NaN. -inline bool isKnownNeverSNaN(Register Val, const MachineRegisterInfo &MRI) { - return isKnownNeverNaN(Val, MRI, true); +inline bool isKnownNeverSNaN(Register Val, const MachineRegisterInfo &MRI, GISelValueTracking *ValueTracking) { + return isKnownNeverNaN(Val, MRI, ValueTracking, true); } Align inferAlignFromPtrInfo(MachineFunction &MF, const MachinePointerInfo &MPO); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index b1e851183de0d..8952226ae7f1e 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -6519,8 +6519,8 @@ unsigned CombinerHelper::getFPMinMaxOpcForSelect( CombinerHelper::SelectPatternNaNBehaviour CombinerHelper::computeRetValAgainstNaN(Register LHS, Register RHS, bool IsOrderedComparison) const { - bool LHSSafe = isKnownNeverNaN(LHS, MRI); - bool RHSSafe = isKnownNeverNaN(RHS, MRI); + bool LHSSafe = isKnownNeverNaN(LHS, MRI, VT); + bool RHSSafe = isKnownNeverNaN(RHS, MRI, VT); // Completely unsafe. if (!LHSSafe && !RHSSafe) return SelectPatternNaNBehaviour::NOT_APPLICABLE; diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp index 67b1a449f8483..f1e77d813f0df 100644 --- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp @@ -693,6 +693,9 @@ static bool outputDenormalIsIEEEOrPosZero(const MachineFunction &MF, LLT Ty) { void GISelValueTracking::computeKnownFPClass(Register R, KnownFPClass &Known, FPClassTest InterestedClasses, unsigned Depth) { + if (!R.isVirtual()) + return; + LLT Ty = MRI.getType(R); APInt DemandedElts = Ty.isFixedVector() ? APInt::getAllOnes(Ty.getNumElements()) : APInt(1, 1); @@ -736,6 +739,9 @@ void GISelValueTracking::computeKnownFPClass(Register R, assert(Depth <= MaxAnalysisRecursionDepth && "Limit Search Depth"); + if (!R.isVirtual()) + return; + MachineInstr &MI = *MRI.getVRegDef(R); unsigned Opcode = MI.getOpcode(); LLT DstTy = MRI.getType(R); @@ -1024,7 +1030,7 @@ void GISelValueTracking::computeKnownFPClass(Register R, // if ((Known.KnownFPClasses & fcZero) != fcNone && !Known.isKnownNeverSubnormal()) { - DenormalMode Mode = MF->getDenormalMode(getFltSemanticForLLT(DstTy)); + DenormalMode Mode = MF->getDenormalMode(getFltSemanticForLLT(DstTy.getScalarType())); if (Mode != DenormalMode::getIEEE()) Known.KnownFPClasses |= fcZero; } @@ -1086,8 +1092,8 @@ void GISelValueTracking::computeKnownFPClass(Register R, // If the parent function flushes denormals, the canonical output cannot // be a denormal. - LLT Ty = MRI.getType(Val); - const fltSemantics &FPType = getFltSemanticForLLT(Ty.getScalarType()); + LLT Ty = MRI.getType(Val).getScalarType(); + const fltSemantics &FPType = getFltSemanticForLLT(Ty); DenormalMode DenormMode = MF->getDenormalMode(FPType); if (DenormMode == DenormalMode::getIEEE()) { if (KnownSrc.isKnownNever(fcPosZero)) @@ -1197,8 +1203,8 @@ void GISelValueTracking::computeKnownFPClass(Register R, if (KnownSrc.isKnownNeverNaN() && KnownSrc.cannotBeOrderedLessThanZero()) Known.knownNot(fcNan); - LLT Ty = MRI.getType(Val); - const fltSemantics &FltSem = getFltSemanticForLLT(Ty.getScalarType()); + LLT Ty = MRI.getType(Val).getScalarType(); + const fltSemantics &FltSem = getFltSemanticForLLT(Ty); DenormalMode Mode = MF->getDenormalMode(FltSem); if (KnownSrc.isKnownNeverLogicalZero(Mode)) @@ -1317,18 +1323,18 @@ void GISelValueTracking::computeKnownFPClass(Register R, // (fadd x, 0.0) is guaranteed to return +0.0, not -0.0. if ((KnownLHS.isKnownNeverLogicalNegZero( - MF->getDenormalMode(getFltSemanticForLLT(DstTy))) || + MF->getDenormalMode(getFltSemanticForLLT(DstTy.getScalarType()))) || KnownRHS.isKnownNeverLogicalNegZero( - MF->getDenormalMode(getFltSemanticForLLT(DstTy)))) && + MF->getDenormalMode(getFltSemanticForLLT(DstTy.getScalarType())))) && // Make sure output negative denormal can't flush to -0 outputDenormalIsIEEEOrPosZero(*MF, DstTy)) Known.knownNot(fcNegZero); } else { // Only fsub -0, +0 can return -0 if ((KnownLHS.isKnownNeverLogicalNegZero( - MF->getDenormalMode(getFltSemanticForLLT(DstTy))) || + MF->getDenormalMode(getFltSemanticForLLT(DstTy.getScalarType()))) || KnownRHS.isKnownNeverLogicalPosZero( - MF->getDenormalMode(getFltSemanticForLLT(DstTy)))) && + MF->getDenormalMode(getFltSemanticForLLT(DstTy.getScalarType())))) && // Make sure output negative denormal can't flush to -0 outputDenormalIsIEEEOrPosZero(*MF, DstTy)) Known.knownNot(fcNegZero); @@ -1375,10 +1381,10 @@ void GISelValueTracking::computeKnownFPClass(Register R, if ((KnownRHS.isKnownNeverInfinity() || KnownLHS.isKnownNeverLogicalZero( - MF->getDenormalMode(getFltSemanticForLLT(DstTy)))) && + MF->getDenormalMode(getFltSemanticForLLT(DstTy.getScalarType())))) && (KnownLHS.isKnownNeverInfinity() || KnownRHS.isKnownNeverLogicalZero( - MF->getDenormalMode(getFltSemanticForLLT(DstTy))))) + MF->getDenormalMode(getFltSemanticForLLT(DstTy.getScalarType()))))) Known.knownNot(fcNan); break; @@ -1431,9 +1437,9 @@ void GISelValueTracking::computeKnownFPClass(Register R, (KnownLHS.isKnownNeverInfinity() || KnownRHS.isKnownNeverInfinity()) && ((KnownLHS.isKnownNeverLogicalZero( - MF->getDenormalMode(getFltSemanticForLLT(DstTy)))) || + MF->getDenormalMode(getFltSemanticForLLT(DstTy.getScalarType())))) || (KnownRHS.isKnownNeverLogicalZero( - MF->getDenormalMode(getFltSemanticForLLT(DstTy)))))) { + MF->getDenormalMode(getFltSemanticForLLT(DstTy.getScalarType())))))) { Known.knownNot(fcNan); } @@ -1447,7 +1453,7 @@ void GISelValueTracking::computeKnownFPClass(Register R, if (KnownLHS.isKnownNeverNaN() && KnownRHS.isKnownNeverNaN() && KnownLHS.isKnownNeverInfinity() && KnownRHS.isKnownNeverLogicalZero( - MF->getDenormalMode(getFltSemanticForLLT(DstTy)))) { + MF->getDenormalMode(getFltSemanticForLLT(DstTy.getScalarType())))) { Known.knownNot(fcNan); } @@ -1472,10 +1478,10 @@ void GISelValueTracking::computeKnownFPClass(Register R, // Infinity, nan and zero propagate from source. computeKnownFPClass(R, DemandedElts, InterestedClasses, Known, Depth + 1); - LLT DstTy = MRI.getType(Dst); - const fltSemantics &DstSem = getFltSemanticForLLT(DstTy.getScalarType()); - LLT SrcTy = MRI.getType(Src); - const fltSemantics &SrcSem = getFltSemanticForLLT(SrcTy.getScalarType()); + LLT DstTy = MRI.getType(Dst).getScalarType(); + const fltSemantics &DstSem = getFltSemanticForLLT(DstTy); + LLT SrcTy = MRI.getType(Src).getScalarType(); + const fltSemantics &SrcSem = getFltSemanticForLLT(SrcTy); // All subnormal inputs should be in the normal range in the result type. if (APFloat::isRepresentableAsNormalIn(SrcSem, DstSem)) { diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 7b18a98d7f3ca..e242df04a5d80 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -8179,10 +8179,10 @@ LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) { // Note this must be done here, and not as an optimization combine in the // absence of a dedicate quiet-snan instruction as we're using an // omni-purpose G_FCANONICALIZE. - if (!isKnownNeverSNaN(Src0, MRI)) + if (!isKnownNeverSNaN(Src0, MRI, VT)) Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0); - if (!isKnownNeverSNaN(Src1, MRI)) + if (!isKnownNeverSNaN(Src1, MRI, VT)) Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0); } diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 64af7a57e8d12..227fac4007463 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -12,6 +12,7 @@ #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" +#include "llvm/ADT/FloatingPointMode.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/CodeGenCommonISel.h" #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" @@ -806,88 +807,13 @@ llvm::ConstantFoldVectorBinop(unsigned Opcode, const Register Op1, return FoldedElements; } -bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI, +bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI, GISelValueTracking *VT, bool SNaN) { - const MachineInstr *DefMI = MRI.getVRegDef(Val); - if (!DefMI) - return false; - - const TargetMachine& TM = DefMI->getMF()->getTarget(); - if (DefMI->getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) - return true; - - // If the value is a constant, we can obviously see if it is a NaN or not. - if (const ConstantFP *FPVal = getConstantFPVRegVal(Val, MRI)) { - return !FPVal->getValueAPF().isNaN() || - (SNaN && !FPVal->getValueAPF().isSignaling()); - } - - if (DefMI->getOpcode() == TargetOpcode::G_BUILD_VECTOR) { - for (const auto &Op : DefMI->uses()) - if (!isKnownNeverNaN(Op.getReg(), MRI, SNaN)) - return false; - return true; - } - - switch (DefMI->getOpcode()) { - default: - break; - case TargetOpcode::G_FADD: - case TargetOpcode::G_FSUB: - case TargetOpcode::G_FMUL: - case TargetOpcode::G_FDIV: - case TargetOpcode::G_FREM: - case TargetOpcode::G_FSIN: - case TargetOpcode::G_FCOS: - case TargetOpcode::G_FTAN: - case TargetOpcode::G_FACOS: - case TargetOpcode::G_FASIN: - case TargetOpcode::G_FATAN: - case TargetOpcode::G_FATAN2: - case TargetOpcode::G_FCOSH: - case TargetOpcode::G_FSINH: - case TargetOpcode::G_FTANH: - case TargetOpcode::G_FMA: - case TargetOpcode::G_FMAD: - if (SNaN) - return true; - - // TODO: Need isKnownNeverInfinity - return false; - case TargetOpcode::G_FMINNUM_IEEE: - case TargetOpcode::G_FMAXNUM_IEEE: { - if (SNaN) - return true; - // This can return a NaN if either operand is an sNaN, or if both operands - // are NaN. - return (isKnownNeverNaN(DefMI->getOperand(1).getReg(), MRI) && - isKnownNeverSNaN(DefMI->getOperand(2).getReg(), MRI)) || - (isKnownNeverSNaN(DefMI->getOperand(1).getReg(), MRI) && - isKnownNeverNaN(DefMI->getOperand(2).getReg(), MRI)); - } - case TargetOpcode::G_FMINNUM: - case TargetOpcode::G_FMAXNUM: { - // Only one needs to be known not-nan, since it will be returned if the - // other ends up being one. - return isKnownNeverNaN(DefMI->getOperand(1).getReg(), MRI, SNaN) || - isKnownNeverNaN(DefMI->getOperand(2).getReg(), MRI, SNaN); - } - } - - if (SNaN) { - // FP operations quiet. For now, just handle the ones inserted during - // legalization. - switch (DefMI->getOpcode()) { - case TargetOpcode::G_FPEXT: - case TargetOpcode::G_FPTRUNC: - case TargetOpcode::G_FCANONICALIZE: - return true; - default: - return false; - } - } - - return false; + KnownFPClass FPClass = VT->computeKnownFPClass(Val, fcNan); + if (SNaN) + return FPClass.isKnownNever(fcSNan); + + return FPClass.isKnownNeverNaN(); } Align llvm::inferAlignFromPtrInfo(MachineFunction &MF, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 18a948d68e97b..2a6073c20c73b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -859,7 +859,7 @@ class NeverNaNPats frags> : PatFrags { return CurDAG->isKnownNeverNaN(SDValue(N,0)); }]; let GISelPredicateCode = [{ - return isKnownNeverNaN(MI.getOperand(0).getReg(), MRI); + return isKnownNeverNaN(MI.getOperand(0).getReg(), MRI, VT); }]; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index f08502fb3d928..7279fbe474212 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -261,7 +261,7 @@ bool AMDGPURegBankCombinerImpl::matchFPMinMaxToMed3( // nodes(max/min) have same behavior when one input is NaN and other isn't. // Don't consider max(min(SNaN, K1), K0) since there is no isKnownNeverQNaN, // also post-legalizer inputs to min/max are fcanonicalized (never SNaN). - if ((getIEEE() && isFminnumIeee(MI)) || isKnownNeverNaN(Dst, MRI)) { + if ((getIEEE() && isFminnumIeee(MI)) || isKnownNeverNaN(Dst, MRI, VT)) { // Don't fold single use constant that can't be inlined. if ((!MRI.hasOneNonDBGUse(K0->VReg) || TII.isInlineConstant(K0->Value)) && (!MRI.hasOneNonDBGUse(K1->VReg) || TII.isInlineConstant(K1->Value))) { @@ -291,8 +291,8 @@ bool AMDGPURegBankCombinerImpl::matchFPMinMaxToClamp(MachineInstr &MI, // For IEEE=true consider NaN inputs. Only min(max(QNaN, 0.0), 1.0) evaluates // to 0.0 requires dx10_clamp = true. if ((getIEEE() && getDX10Clamp() && isFminnumIeee(MI) && - isKnownNeverSNaN(Val, MRI)) || - isKnownNeverNaN(MI.getOperand(0).getReg(), MRI)) { + isKnownNeverSNaN(Val, MRI, VT)) || + isKnownNeverNaN(MI.getOperand(0).getReg(), MRI, VT)) { Reg = Val; return true; } @@ -329,6 +329,8 @@ bool AMDGPURegBankCombinerImpl::matchFPMed3ToClamp(MachineInstr &MI, Register Val = Src0->getOperand(0).getReg(); auto isOp3Zero = [&]() { + if (MI.getNumOperands() < 5) + return false; MachineInstr *Op3 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI); if (Op3->getOpcode() == TargetOpcode::G_FCONSTANT) return Op3->getOperand(1).getFPImm()->isExactlyValue(0.0); @@ -338,9 +340,9 @@ bool AMDGPURegBankCombinerImpl::matchFPMed3ToClamp(MachineInstr &MI, // no NaN inputs. Most often MI is marked with nnan fast math flag. // For IEEE=true consider NaN inputs. Requires dx10_clamp = true. Safe to fold // when Val could be QNaN. If Val can also be SNaN third input should be 0.0. - if (isKnownNeverNaN(MI.getOperand(0).getReg(), MRI) || + if (isKnownNeverNaN(MI.getOperand(0).getReg(), MRI, VT) || (getIEEE() && getDX10Clamp() && - (isKnownNeverSNaN(Val, MRI) || isOp3Zero()))) { + (isKnownNeverSNaN(Val, MRI, VT) || isOp3Zero()))) { Reg = Val; return true; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll index c7676e9da6f49..0ca26b1b7d0df 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll @@ -74,7 +74,8 @@ define float @test_fmed3_maybe_SNaN_input_zero_third_operand_ieee_true_dx10clamp ; GFX10-LABEL: test_fmed3_maybe_SNaN_input_zero_third_operand_ieee_true_dx10clamp_true: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp +; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0 +; GFX10-NEXT: v_med3_f32 v0, v0, 1.0, 0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: test_fmed3_maybe_SNaN_input_zero_third_operand_ieee_true_dx10clamp_true: @@ -84,7 +85,9 @@ define float @test_fmed3_maybe_SNaN_input_zero_third_operand_ieee_true_dx10clamp ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp +; GFX12-NEXT: v_mul_f32_e32 v0, 2.0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_med3_num_f32 v0, v0, 1.0, 0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = fmul float %a, 2.0 %fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 1.0, float 0.0) @@ -97,7 +100,8 @@ define float @test_fmed3_global_nnan(float %a) #3 { ; GFX10-LABEL: test_fmed3_global_nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp +; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0 +; GFX10-NEXT: v_med3_f32 v0, v0, 0, 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: test_fmed3_global_nnan: @@ -107,7 +111,9 @@ define float @test_fmed3_global_nnan(float %a) #3 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp +; GFX12-NEXT: v_mul_f32_e32 v0, 2.0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_med3_num_f32 v0, v0, 0, 1.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = fmul float %a, 2.0 %fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 0.0, float 1.0) @@ -134,7 +140,9 @@ define float @test_fmed3_f32_maybe_NaN_ieee_false(float %a) #1 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp +; GFX12-NEXT: v_mul_f32_e32 v0, 2.0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_med3_num_f32 v0, v0, 1.0, 0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = fmul float %a, 2.0 %fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 1.0, float 0.0) @@ -172,7 +180,8 @@ define float @test_fmed3_maybe_SNaN_input_ieee_true_dx10clamp_true(float %a) #2 ; GFX10-LABEL: test_fmed3_maybe_SNaN_input_ieee_true_dx10clamp_true: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp +; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0 +; GFX10-NEXT: v_med3_f32 v0, v0, 0, 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: test_fmed3_maybe_SNaN_input_ieee_true_dx10clamp_true: @@ -182,7 +191,9 @@ define float @test_fmed3_maybe_SNaN_input_ieee_true_dx10clamp_true(float %a) #2 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp +; GFX12-NEXT: v_mul_f32_e32 v0, 2.0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_med3_num_f32 v0, v0, 0, 1.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = fmul float %a, 2.0 %fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 0.0, float 1.0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll index e2e1c1147eeee..70276bd670715 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll @@ -51,7 +51,8 @@ define half @test_min_K1max_ValK0_f16(half %a) #2 { ; GFX10-LABEL: test_min_K1max_ValK0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mul_f16_e64 v0, v0, 2.0 clamp +; GFX10-NEXT: v_mul_f16_e32 v0, 2.0, v0 +; GFX10-NEXT: v_med3_f16 v0, v0, 0, 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: test_min_K1max_ValK0_f16: @@ -61,7 +62,9 @@ define half @test_min_K1max_ValK0_f16(half %a) #2 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mul_f16_e64 v0, v0, 2.0 clamp +; GFX12-NEXT: v_mul_f16_e32 v0, 2.0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_med3_num_f16 v0, v0, 0, 1.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = fmul half %a, 2.0 %maxnum = call half @llvm.maxnum.f16(half %fmul, half 0.0) @@ -95,7 +98,9 @@ define <2 x half> @test_min_max_splat_padded_with_undef(<2 x half> %a) #2 { ; GFX10-LABEL: test_min_max_splat_padded_with_undef: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] clamp +; GFX10-NEXT: v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_max_f16 v0, v0, 0 +; GFX10-NEXT: v_pk_min_f16 v0, v0, 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: test_min_max_splat_padded_with_undef: @@ -105,7 +110,10 @@ define <2 x half> @test_min_max_splat_padded_with_undef(<2 x half> %a) #2 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] clamp +; GFX12-NEXT: v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, 0 +; GFX12-NEXT: v_pk_min_num_f16 v0, v0, 1.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = fmul <2 x half> %a, %maxnum = call <2 x half> @llvm.maxnum.v2f16(<2 x half> , <2 x half> %fmul) @@ -231,7 +239,9 @@ define float @test_max_min_global_nnan(float %a) #3 { ; GFX10-LABEL: test_max_min_global_nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e64 v0, v0, v0 clamp +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, 0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: test_max_min_global_nnan: @@ -241,7 +251,9 @@ define float @test_max_min_global_nnan(float %a) #3 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e64 v0, v0, v0 clamp +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_minmax_num_f32 v0, v0, 1.0, 0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %minnum = call float @llvm.minnum.f32(float %a, float 1.0) %fmed = call float @llvm.maxnum.f32(float %minnum, float 0.0) @@ -305,9 +317,7 @@ define float @test_min_max_maybe_NaN_input_ieee_false(float %a) #1 { ; GFX10-LABEL: test_min_max_maybe_NaN_input_ieee_false: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0 -; GFX10-NEXT: v_max_f32_e32 v0, 0, v0 -; GFX10-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GFX10-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: test_min_max_maybe_NaN_input_ieee_false: @@ -317,7 +327,9 @@ define float @test_min_max_maybe_NaN_input_ieee_false(float %a) #1 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp +; GFX12-NEXT: v_mul_f32_e32 v0, 2.0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_med3_num_f32 v0, v0, 0, 1.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = fmul float %a, 2.0 %maxnum = call float @llvm.maxnum.f32(float %fmul, float 0.0) @@ -341,7 +353,9 @@ define float @test_min_max_maybe_NaN_input_ieee_true_dx10clamp_false(float %a) # ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp +; GFX12-NEXT: v_mul_f32_e32 v0, 2.0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_med3_num_f32 v0, v0, 0, 1.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmul = fmul float %a, 2.0 %maxnum = call float @llvm.maxnum.f32(float %fmul, float 0.0) @@ -381,9 +395,7 @@ define float @test_max_min_maybe_NaN_input_ieee_false(float %a) #1 { ; GFX10-LABEL: test_max_min_maybe_NaN_input_ieee_false: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0 -; GFX10-NEXT: v_min_f32_e32 v0, 1.0, v0 -; GFX10-NEXT: v_max_f32_e32 v0, 0, v0 +; GFX10-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: test_max_min_maybe_NaN_input_ieee_false: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll index 75c4cd53e3bfc..97c86b9582784 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll @@ -236,12 +236,14 @@ define float @test_min_max_global_nnan(float %a) #2 { ; GFX10-LABEL: test_min_max_global_nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX10-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_min_max_global_nnan: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX8-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -252,6 +254,8 @@ define float @test_min_max_global_nnan(float %a) #2 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %maxnum = call float @llvm.maxnum.f32(float %a, float 2.0) @@ -263,13 +267,17 @@ define float @test_max_min_global_nnan(float %a) #2 { ; GFX10-LABEL: test_max_min_global_nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, 2.0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_max_min_global_nnan: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, 2.0, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: test_max_min_global_nnan: @@ -279,7 +287,9 @@ define float @test_max_min_global_nnan(float %a) #2 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_minmax_num_f32 v0, v0, 4.0, 2.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %minnum = call float @llvm.minnum.f32(float %a, float 4.0) %fmed = call float @llvm.maxnum.f32(float %minnum, float 2.0) @@ -456,15 +466,13 @@ define float @test_min_max_maybe_NaN_input_ieee_false(float %a) #1 { ; GFX10-LABEL: test_min_max_maybe_NaN_input_ieee_false: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v0, 2.0, v0 -; GFX10-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX10-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_min_max_maybe_NaN_input_ieee_false: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f32_e32 v0, 2.0, v0 -; GFX8-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX8-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: test_min_max_maybe_NaN_input_ieee_false: @@ -489,15 +497,13 @@ define float @test_max_min_maybe_NaN_input_ieee_false(float %a) #1 { ; GFX10-LABEL: test_max_min_maybe_NaN_input_ieee_false: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX10-NEXT: v_max_f32_e32 v0, 2.0, v0 +; GFX10-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_max_min_maybe_NaN_input_ieee_false: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX8-NEXT: v_max_f32_e32 v0, 2.0, v0 +; GFX8-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: test_max_min_maybe_NaN_input_ieee_false: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaxnum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaxnum.mir index d977049de26f4..eb1f0096c113a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaxnum.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaxnum.mir @@ -291,7 +291,9 @@ body: | ; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) ; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) - ; SI-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT]], [[FPEXT1]] + ; SI-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT]] + ; SI-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT1]] + ; SI-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]] ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE]](s32) ; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) ; SI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) @@ -411,11 +413,15 @@ body: | ; SI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) - ; SI-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT]], [[FPEXT1]] + ; SI-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT]] + ; SI-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT1]] + ; SI-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]] ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE]](s32) ; SI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) ; SI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) - ; SI-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT2]], [[FPEXT3]] + ; SI-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT2]] + ; SI-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT3]] + ; SI-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]] ; SI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE1]](s32) ; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) @@ -493,15 +499,21 @@ body: | ; SI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) - ; SI-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT]], [[FPEXT1]] + ; SI-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT]] + ; SI-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT1]] + ; SI-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]] ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE]](s32) ; SI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) ; SI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16) - ; SI-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT2]], [[FPEXT3]] + ; SI-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT2]] + ; SI-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT3]] + ; SI-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]] ; SI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE1]](s32) ; SI-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) ; SI-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16) - ; SI-NEXT: [[FMAXNUM_IEEE2:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT4]], [[FPEXT5]] + ; SI-NEXT: [[FCANONICALIZE4:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT4]] + ; SI-NEXT: [[FCANONICALIZE5:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT5]] + ; SI-NEXT: [[FMAXNUM_IEEE2:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE4]], [[FCANONICALIZE5]] ; SI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE2]](s32) ; SI-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; SI-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) @@ -661,19 +673,27 @@ body: | ; SI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16) - ; SI-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT]], [[FPEXT1]] + ; SI-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT]] + ; SI-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT1]] + ; SI-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]] ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE]](s32) ; SI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) ; SI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16) - ; SI-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT2]], [[FPEXT3]] + ; SI-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT2]] + ; SI-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT3]] + ; SI-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]] ; SI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE1]](s32) ; SI-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) ; SI-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC6]](s16) - ; SI-NEXT: [[FMAXNUM_IEEE2:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT4]], [[FPEXT5]] + ; SI-NEXT: [[FCANONICALIZE4:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT4]] + ; SI-NEXT: [[FCANONICALIZE5:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT5]] + ; SI-NEXT: [[FMAXNUM_IEEE2:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE4]], [[FCANONICALIZE5]] ; SI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE2]](s32) ; SI-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) ; SI-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC7]](s16) - ; SI-NEXT: [[FMAXNUM_IEEE3:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT6]], [[FPEXT7]] + ; SI-NEXT: [[FCANONICALIZE6:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT6]] + ; SI-NEXT: [[FCANONICALIZE7:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT7]] + ; SI-NEXT: [[FMAXNUM_IEEE3:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE6]], [[FCANONICALIZE7]] ; SI-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE3]](s32) ; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) @@ -1040,11 +1060,15 @@ body: | ; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16) - ; SI-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT]], [[FPEXT1]] + ; SI-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT]] + ; SI-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT1]] + ; SI-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]] ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE]](s32) ; SI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) ; SI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16) - ; SI-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT2]], [[FPEXT3]] + ; SI-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT2]] + ; SI-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT3]] + ; SI-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]] ; SI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE1]](s32) ; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminnum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminnum.mir index 32c353d2c579c..4f99e6f8ea6a4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminnum.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminnum.mir @@ -291,7 +291,9 @@ body: | ; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) ; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) - ; SI-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FPEXT]], [[FPEXT1]] + ; SI-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT]] + ; SI-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT1]] + ; SI-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]] ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMINNUM_IEEE]](s32) ; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) ; SI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) @@ -411,11 +413,15 @@ body: | ; SI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) - ; SI-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FPEXT]], [[FPEXT1]] + ; SI-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT]] + ; SI-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT1]] + ; SI-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]] ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMINNUM_IEEE]](s32) ; SI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) ; SI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) - ; SI-NEXT: [[FMINNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FPEXT2]], [[FPEXT3]] + ; SI-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT2]] + ; SI-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT3]] + ; SI-NEXT: [[FMINNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]] ; SI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMINNUM_IEEE1]](s32) ; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) @@ -493,15 +499,21 @@ body: | ; SI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) - ; SI-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FPEXT]], [[FPEXT1]] + ; SI-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT]] + ; SI-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT1]] + ; SI-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]] ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMINNUM_IEEE]](s32) ; SI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) ; SI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16) - ; SI-NEXT: [[FMINNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FPEXT2]], [[FPEXT3]] + ; SI-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT2]] + ; SI-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT3]] + ; SI-NEXT: [[FMINNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]] ; SI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMINNUM_IEEE1]](s32) ; SI-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) ; SI-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16) - ; SI-NEXT: [[FMINNUM_IEEE2:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FPEXT4]], [[FPEXT5]] + ; SI-NEXT: [[FCANONICALIZE4:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT4]] + ; SI-NEXT: [[FCANONICALIZE5:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT5]] + ; SI-NEXT: [[FMINNUM_IEEE2:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE4]], [[FCANONICALIZE5]] ; SI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMINNUM_IEEE2]](s32) ; SI-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; SI-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) @@ -661,19 +673,27 @@ body: | ; SI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16) - ; SI-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FPEXT]], [[FPEXT1]] + ; SI-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT]] + ; SI-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT1]] + ; SI-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]] ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMINNUM_IEEE]](s32) ; SI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) ; SI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16) - ; SI-NEXT: [[FMINNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FPEXT2]], [[FPEXT3]] + ; SI-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT2]] + ; SI-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT3]] + ; SI-NEXT: [[FMINNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]] ; SI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMINNUM_IEEE1]](s32) ; SI-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) ; SI-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC6]](s16) - ; SI-NEXT: [[FMINNUM_IEEE2:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FPEXT4]], [[FPEXT5]] + ; SI-NEXT: [[FCANONICALIZE4:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT4]] + ; SI-NEXT: [[FCANONICALIZE5:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT5]] + ; SI-NEXT: [[FMINNUM_IEEE2:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE4]], [[FCANONICALIZE5]] ; SI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMINNUM_IEEE2]](s32) ; SI-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) ; SI-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC7]](s16) - ; SI-NEXT: [[FMINNUM_IEEE3:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FPEXT6]], [[FPEXT7]] + ; SI-NEXT: [[FCANONICALIZE6:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT6]] + ; SI-NEXT: [[FCANONICALIZE7:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT7]] + ; SI-NEXT: [[FMINNUM_IEEE3:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE6]], [[FCANONICALIZE7]] ; SI-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMINNUM_IEEE3]](s32) ; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) @@ -1040,11 +1060,15 @@ body: | ; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16) - ; SI-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FPEXT]], [[FPEXT1]] + ; SI-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT]] + ; SI-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT1]] + ; SI-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]] ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMINNUM_IEEE]](s32) ; SI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) ; SI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16) - ; SI-NEXT: [[FMINNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FPEXT2]], [[FPEXT3]] + ; SI-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT2]] + ; SI-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT3]] + ; SI-NEXT: [[FMINNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]] ; SI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMINNUM_IEEE1]](s32) ; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-vector-args-gfx7.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-vector-args-gfx7.mir index 4328d47969a1e..29266b42227e1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-vector-args-gfx7.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-vector-args-gfx7.mir @@ -290,23 +290,33 @@ body: | ; GFX7-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) ; GFX7-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; GFX7-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16) - ; GFX7-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT]], [[FPEXT1]] + ; GFX7-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT]] + ; GFX7-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT1]] + ; GFX7-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]] ; GFX7-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE]](s32) ; GFX7-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) ; GFX7-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC6]](s16) - ; GFX7-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT2]], [[FPEXT3]] + ; GFX7-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT2]] + ; GFX7-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT3]] + ; GFX7-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]] ; GFX7-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE1]](s32) ; GFX7-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) ; GFX7-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC7]](s16) - ; GFX7-NEXT: [[FMAXNUM_IEEE2:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT4]], [[FPEXT5]] + ; GFX7-NEXT: [[FCANONICALIZE4:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT4]] + ; GFX7-NEXT: [[FCANONICALIZE5:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT5]] + ; GFX7-NEXT: [[FMAXNUM_IEEE2:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE4]], [[FCANONICALIZE5]] ; GFX7-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE2]](s32) ; GFX7-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) ; GFX7-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC8]](s16) - ; GFX7-NEXT: [[FMAXNUM_IEEE3:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT6]], [[FPEXT7]] + ; GFX7-NEXT: [[FCANONICALIZE6:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT6]] + ; GFX7-NEXT: [[FCANONICALIZE7:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT7]] + ; GFX7-NEXT: [[FMAXNUM_IEEE3:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE6]], [[FCANONICALIZE7]] ; GFX7-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE3]](s32) ; GFX7-NEXT: [[FPEXT8:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16) ; GFX7-NEXT: [[FPEXT9:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC9]](s16) - ; GFX7-NEXT: [[FMAXNUM_IEEE4:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FPEXT8]], [[FPEXT9]] + ; GFX7-NEXT: [[FCANONICALIZE8:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT8]] + ; GFX7-NEXT: [[FCANONICALIZE9:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FPEXT9]] + ; GFX7-NEXT: [[FMAXNUM_IEEE4:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE8]], [[FCANONICALIZE9]] ; GFX7-NEXT: [[FPTRUNC4:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMAXNUM_IEEE4]](s32) ; GFX7-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) ; GFX7-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC1]](s16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir index a97d905f2a978..129cbcfca6fa5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir @@ -162,8 +162,12 @@ body: | ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]] - ; CHECK-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s32) = G_AMDGPU_CLAMP [[FMUL]] - ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00 + ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FMED3 [[FMUL]], [[COPY2]], [[COPY3]] + ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) ; ; GFX12-LABEL: name: test_fmed3_maybe_SNaN_input_zero_third_operand_ieee_true_dx10clamp_true ; GFX12: liveins: $vgpr0 @@ -172,8 +176,12 @@ body: | ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; GFX12-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]] - ; GFX12-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s32) = G_AMDGPU_CLAMP [[FMUL]] - ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32) + ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; GFX12-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FMED3 [[FMUL]], [[COPY2]], [[COPY3]] + ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) %0:vgpr(s32) = COPY $vgpr0 %2:sgpr(s32) = G_FCONSTANT float 2.000000e+00 %8:vgpr(s32) = COPY %2(s32) @@ -222,8 +230,12 @@ body: | ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; GFX12-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]] - ; GFX12-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s32) = G_AMDGPU_CLAMP [[FMUL]] - ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32) + ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; GFX12-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FMED3 [[FMUL]], [[COPY2]], [[COPY3]] + ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) %0:vgpr(s32) = COPY $vgpr0 %2:sgpr(s32) = G_FCONSTANT float 2.000000e+00 %8:vgpr(s32) = COPY %2(s32) @@ -307,8 +319,12 @@ body: | ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]] - ; CHECK-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s32) = G_AMDGPU_CLAMP [[FMUL]] - ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00 + ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FMED3 [[FMUL]], [[COPY2]], [[COPY3]] + ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) ; ; GFX12-LABEL: name: test_fmed3_maybe_SNaN_input_ieee_true_dx10clamp_true ; GFX12: liveins: $vgpr0 @@ -317,8 +333,12 @@ body: | ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; GFX12-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]] - ; GFX12-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s32) = G_AMDGPU_CLAMP [[FMUL]] - ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32) + ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; GFX12-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FMED3 [[FMUL]], [[COPY2]], [[COPY3]] + ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) %0:vgpr(s32) = COPY $vgpr0 %2:sgpr(s32) = G_FCONSTANT float 2.000000e+00 %8:vgpr(s32) = COPY %2(s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-minmax-const.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-minmax-const.mir index 70fd67363648d..7e5555b68daad 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-minmax-const.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-minmax-const.mir @@ -441,13 +441,8 @@ body: | ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]] - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK-NEXT: [[FMAXNUM:%[0-9]+]]:vgpr(s32) = G_FMAXNUM [[FMUL]], [[COPY2]] - ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) - ; CHECK-NEXT: [[FMINNUM:%[0-9]+]]:vgpr(s32) = G_FMINNUM [[FMAXNUM]], [[COPY3]] - ; CHECK-NEXT: $vgpr0 = COPY [[FMINNUM]](s32) + ; CHECK-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s32) = G_AMDGPU_CLAMP [[FMUL]] + ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32) %0:vgpr(s32) = COPY $vgpr0 %2:sgpr(s32) = G_FCONSTANT float 2.000000e+00 %9:vgpr(s32) = COPY %2(s32) @@ -564,13 +559,8 @@ body: | ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]] - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK-NEXT: [[FMINNUM:%[0-9]+]]:vgpr(s32) = G_FMINNUM [[FMUL]], [[COPY2]] - ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) - ; CHECK-NEXT: [[FMAXNUM:%[0-9]+]]:vgpr(s32) = G_FMAXNUM [[FMINNUM]], [[COPY3]] - ; CHECK-NEXT: $vgpr0 = COPY [[FMAXNUM]](s32) + ; CHECK-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s32) = G_AMDGPU_CLAMP [[FMUL]] + ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32) %0:vgpr(s32) = COPY $vgpr0 %2:sgpr(s32) = G_FCONSTANT float 2.000000e+00 %9:vgpr(s32) = COPY %2(s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-fmed3-minmax-const.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-fmed3-minmax-const.mir index 2f41d86100040..f329d126e66db 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-fmed3-minmax-const.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-fmed3-minmax-const.mir @@ -469,11 +469,10 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[FMAXNUM:%[0-9]+]]:vgpr(s32) = G_FMAXNUM [[COPY]], [[COPY1]] ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 4.000000e+00 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK-NEXT: [[FMINNUM:%[0-9]+]]:vgpr(s32) = G_FMINNUM [[FMAXNUM]], [[COPY2]] - ; CHECK-NEXT: $vgpr0 = COPY [[FMINNUM]](s32) + ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FMED3 [[COPY]], [[COPY1]], [[COPY2]] + ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) %0:vgpr(s32) = COPY $vgpr0 %2:sgpr(s32) = G_FCONSTANT float 2.000000e+00 %7:vgpr(s32) = COPY %2(s32) @@ -502,11 +501,10 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 4.000000e+00 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[FMINNUM:%[0-9]+]]:vgpr(s32) = G_FMINNUM [[COPY]], [[COPY1]] ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK-NEXT: [[FMAXNUM:%[0-9]+]]:vgpr(s32) = G_FMAXNUM [[FMINNUM]], [[COPY2]] - ; CHECK-NEXT: $vgpr0 = COPY [[FMAXNUM]](s32) + ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FMED3 [[COPY]], [[COPY2]], [[COPY1]] + ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) %0:vgpr(s32) = COPY $vgpr0 %2:sgpr(s32) = G_FCONSTANT float 4.000000e+00 %7:vgpr(s32) = COPY %2(s32) diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index f9a1472b4596f..60aabda10533d 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -886,6 +886,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-GISEL-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -923,33 +924,60 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_med3_f32 v2, v3, 2.0, 4.0 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; VI-GISEL-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_fmed3_r_i_i_no_nans_f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_fmed3_r_i_i_no_nans_f32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_fmed3_r_i_i_no_nans_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_fmed3_r_i_i_no_nans_f32: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_fmed3_r_i_i_no_nans_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_fmed3_r_i_i_no_nans_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -1158,7 +1186,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_mul_f32_e32 v2, -1.0, v2 -; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -1205,20 +1238,25 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 -; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7 -; VI-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; @@ -1249,7 +1287,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1 -; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1 ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-GISEL-NEXT: s_endpgm ; @@ -1284,8 +1327,11 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v4 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1347,8 +1393,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-GISEL-NEXT: v_mul_f32_e32 v3, -1.0, v3 -; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -1395,20 +1446,25 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 -; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, -1.0, v2 -; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; @@ -1438,8 +1494,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-GISEL-NEXT: v_max_f32_e64 v2, -v2, -v2 -; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1 ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-GISEL-NEXT: s_endpgm ; @@ -1473,9 +1534,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX11-GISEL-NEXT: v_max_f32_e64 v2, -v2, -v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v4 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1537,8 +1601,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v4 -; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -1597,8 +1666,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, -1.0, v3 -; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; @@ -1628,8 +1702,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-GISEL-NEXT: v_max_f32_e64 v3, -v3, -v3 -; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1 ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-GISEL-NEXT: s_endpgm ; @@ -1663,9 +1742,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 ; GFX11-GISEL-NEXT: v_max_f32_e64 v3, -v3, -v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX11-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v3, v4 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1728,8 +1810,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_mul_f32_e32 v2, -1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e64 v3, 1.0, |v3| ; SI-GISEL-NEXT: v_mul_f32_e64 v4, -1.0, |v4| -; SI-GISEL-NEXT: v_med3_f32 v2, v2, |v3|, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -1789,8 +1875,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e64 v2, 1.0, |v2| ; VI-GISEL-NEXT: v_mul_f32_e64 v3, -1.0, |v3| -; VI-GISEL-NEXT: v_med3_f32 v2, v4, |v2|, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; @@ -1821,8 +1911,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX9-GISEL-NEXT: v_max_f32_e64 v2, |v2|, |v2| ; GFX9-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3| -; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, |v2|, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1 ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-GISEL-NEXT: s_endpgm ; @@ -1857,9 +1951,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX11-GISEL-NEXT: v_max_f32_e64 v2, |v2|, |v2| ; GFX11-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3| -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, |v2|, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX11-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v3, v4 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1930,7 +2027,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; SI-GISEL-NEXT: v_mul_f32_e64 v2, -1.0, |v2| ; SI-GISEL-NEXT: v_mul_f32_e64 v3, -1.0, |v3| ; SI-GISEL-NEXT: v_mul_f32_e64 v4, -1.0, |v4| -; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -1992,7 +2092,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; VI-GISEL-NEXT: v_mul_f32_e64 v4, -1.0, |v7| ; VI-GISEL-NEXT: v_mul_f32_e64 v2, -1.0, |v2| ; VI-GISEL-NEXT: v_mul_f32_e64 v3, -1.0, |v3| -; VI-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; @@ -2025,7 +2128,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| ; GFX9-GISEL-NEXT: v_max_f32_e64 v2, -|v2|, -|v2| ; GFX9-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3| -; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1 ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-GISEL-NEXT: s_endpgm ; @@ -2062,8 +2168,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| ; GFX11-GISEL-NEXT: v_max_f32_e64 v2, -|v2|, -|v2| ; GFX11-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3| -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX11-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v3, v4 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2776,7 +2884,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -2823,70 +2937,118 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o ; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 -; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid - %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid - %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid - %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid - %a = load volatile float, ptr addrspace(1) %gep0 - %b = load volatile float, ptr addrspace(1) %gep1 - %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %a, float %b) - %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) - store float %med3, ptr addrspace(1) %outgep - ret void -} - -define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v4 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid + %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid + %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid + %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid + %a = load volatile float, ptr addrspace(1) %gep0 + %b = load volatile float, ptr addrspace(1) %gep1 + %c = load volatile float, ptr addrspace(1) %gep2 + %tmp0 = call float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, ptr addrspace(1) %outgep + ret void +} + +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -2928,7 +3090,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v3, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -2975,53 +3143,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o ; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 -; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat1: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_dual_min_f32 v4, v1, v2 :: v_dual_max_f32 v1, v2, v1 +; GFX11-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v4 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -3081,7 +3297,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_mul_f32_e32 v2, -1.0, v2 -; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v5, v2, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -3128,20 +3349,25 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 -; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7 -; VI-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v5, v4, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v4, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v5, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; @@ -3172,7 +3398,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1 -; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v4, v1, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v4, v1 ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-GISEL-NEXT: s_endpgm ; @@ -3207,8 +3438,11 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_f32_e32 v4, v1, v2 +; GFX11-GISEL-NEXT: v_dual_min_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3 +; GFX11-GISEL-NEXT: v_maxmin_f32 v1, v1, v2, v4 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -3270,7 +3504,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -3317,54 +3557,102 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o ; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 -; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat2: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat2: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm - %tid = call i32 @llvm.amdgcn.workitem.id.x() +; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat2: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat2: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat2: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v2, v1, v4 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid @@ -3422,7 +3710,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v3, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -3469,53 +3763,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o ; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 -; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat3: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat3: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat3: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat3: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat3: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat3: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_dual_min_f32 v4, v1, v2 :: v_dual_max_f32 v1, v2, v1 +; GFX11-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v2, v1, v4 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -3574,7 +3916,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v3, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v3, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -3621,53 +3969,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o ; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 -; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v2, v4 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat4: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat4: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat4: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat4: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v2, v1 +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v2, v1 :: v_dual_max_f32 v2, v3, v3 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v2, v1, v4 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -3726,10 +4122,16 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4 -; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-GISEL-NEXT: s_endpgm +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v3, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v3, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat5: ; VI-SDAG: ; %bb.0: @@ -3773,53 +4175,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o ; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 -; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v2, v4 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat5: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat5: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat5: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat5: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat5: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat5: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v2, v1 +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v2, v1 :: v_dual_max_f32 v2, v3, v3 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v4 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -3878,7 +4328,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v3, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -3925,53 +4381,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o ; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 -; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v2, v4 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat6: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat6: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat6: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat6: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat6: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat6: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_dual_min_f32 v4, v2, v1 :: v_dual_max_f32 v1, v1, v2 +; GFX11-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v2, v1, v4 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -4030,7 +4534,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v3, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v3, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -4077,53 +4587,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o ; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 -; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v2, v4 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat7: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat7: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat7: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat7: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat7: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat7: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v2, v1 +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v2, v1 :: v_dual_max_f32 v2, v3, v3 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v2, v1, v4 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -4182,7 +4740,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -4229,53 +4793,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o ; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 -; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat8: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat8: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat8: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat8: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat8: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat8: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_maxmin_f32 v3, v1, v2, v3 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v3 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -4334,7 +4946,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v3, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -4381,53 +4999,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o ; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 -; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat9: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat9: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat9: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat9: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat9: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat9: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_maxmin_f32 v3, v2, v1, v3 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v3 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -4486,9 +5152,15 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 -; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat10: @@ -4533,53 +5205,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % ; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 -; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat10: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat10: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat10: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat10: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat10: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat10: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_maxmin_f32 v3, v1, v2, v3 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v3 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -4638,7 +5358,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v3, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -4685,53 +5411,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % ; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 -; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat11: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat11: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat11: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat11: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat11: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat11: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_maxmin_f32 v3, v2, v1, v3 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v3 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -4790,7 +5564,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v3, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v3, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -4837,53 +5617,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % ; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 -; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v2, v4 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat12: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat12: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat12: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat12: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat12: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat12: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_maxmin_f32 v3, v2, v1, v3 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v2, v1, v3 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -4942,7 +5770,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v3, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v3, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -4989,53 +5823,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % ; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 -; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v2, v4 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat13: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat13: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat13: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat13: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat13: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat13: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_maxmin_f32 v3, v2, v1, v3 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v2, v1, v3 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -5094,7 +5976,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v3, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -5141,53 +6029,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % ; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 -; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v2, v4 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat14: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat14: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat14: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat14: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat14: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat14: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_maxmin_f32 v3, v1, v2, v3 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v2, v1, v3 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -5246,7 +6182,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v3, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v3, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -5293,53 +6235,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % ; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 -; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v2, v4 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v3, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v5 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat15: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat15: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat15: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat15: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat15: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat15: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_maxmin_f32 v3, v2, v1, v3 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v2, v1, v3 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -5401,7 +6391,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v5, v2, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -5448,53 +6444,101 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 -; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v5, v4, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v4, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v5, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat16: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat16: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v4, v1, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v4, v1 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_f32_e32 v4, v1, v2 +; GFX11-GISEL-NEXT: v_dual_min_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3 +; GFX11-GISEL-NEXT: v_maxmin_f32 v1, v1, v2, v4 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -6400,7 +7444,10 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-GISEL-NEXT: v_add_f32_e32 v3, 2.0, v3 ; SI-GISEL-NEXT: v_add_f32_e32 v4, 4.0, v4 -; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -6465,47 +7512,92 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ; VI-GISEL-NEXT: v_add_f32_e32 v4, 1.0, v7 ; VI-GISEL-NEXT: v_add_f32_e32 v2, 2.0, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v3, 4.0, v3 -; VI-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2 -; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-SDAG-NEXT: v_add_f32_e32 v2, 2.0, v2 +; GFX9-SDAG-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2 -; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-GISEL-NEXT: v_add_f32_e32 v2, 2.0, v2 +; GFX9-GISEL-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2 +; GFX11-SDAG-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_dual_add_f32 v3, 4.0, v3 :: v_dual_min_f32 v4, v1, v2 +; GFX11-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v3, v4 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -6575,7 +7667,10 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) ; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-GISEL-NEXT: v_add_f32_e32 v3, 2.0, v3 ; SI-GISEL-NEXT: v_add_f32_e32 v4, 4.0, v4 -; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -6640,47 +7735,92 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) ; VI-GISEL-NEXT: v_add_f32_e32 v4, 1.0, v7 ; VI-GISEL-NEXT: v_add_f32_e32 v2, 2.0, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v3, 4.0, v3 -; VI-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2 -; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-SDAG-NEXT: v_add_f32_e32 v2, 2.0, v2 +; GFX9-SDAG-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2 -; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-GISEL-NEXT: v_add_f32_e32 v2, 2.0, v2 +; GFX9-GISEL-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2 +; GFX11-SDAG-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_dual_add_f32 v3, 4.0, v3 :: v_dual_min_f32 v4, v1, v2 +; GFX11-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v3, v4 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -6750,7 +7890,10 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-GISEL-NEXT: v_add_f32_e32 v3, 2.0, v3 ; SI-GISEL-NEXT: v_add_f32_e32 v4, 4.0, v4 -; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -6815,47 +7958,92 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ; VI-GISEL-NEXT: v_add_f32_e32 v4, 1.0, v7 ; VI-GISEL-NEXT: v_add_f32_e32 v2, 2.0, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v3, 4.0, v3 -; VI-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2 -; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-SDAG-NEXT: v_add_f32_e32 v2, 2.0, v2 +; GFX9-SDAG-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2 -; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-GISEL-NEXT: v_add_f32_e32 v2, 2.0, v2 +; GFX9-GISEL-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2 +; GFX11-SDAG-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_dual_add_f32 v3, 4.0, v3 :: v_dual_min_f32 v4, v1, v2 +; GFX11-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v3, v4 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -7112,9 +8300,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_mul_f32_e32 v5, -1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-GISEL-NEXT: v_min_f32_e32 v5, v5, v3 ; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 -; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 ; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -7178,10 +8369,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7 -; VI-GISEL-NEXT: v_max_f32_e32 v5, v7, v2 -; VI-GISEL-NEXT: v_min_f32_e32 v2, v4, v2 -; VI-GISEL-NEXT: v_min_f32_e32 v3, v5, v3 -; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; VI-GISEL-NEXT: v_min_f32_e32 v4, v4, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; @@ -7215,10 +8409,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_max_f32_e64 v4, -v1, -v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v4, v2 ; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX9-GISEL-NEXT: v_min_f32_e32 v2, v4, v2 -; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1 ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-GISEL-NEXT: s_endpgm ; @@ -7255,10 +8452,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_max_f32_e64 v4, -v1, -v1 -; GFX11-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v3, v4 +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v4 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -7322,8 +8521,11 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 -; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -7371,55 +8573,97 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out ; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 -; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: v_max_f32_e32 v2, v7, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_global_nnans_min_max_f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_global_nnans_min_max_f32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_global_nnans_min_max_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_maxmin_f32 v1, v1, v2, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_global_nnans_min_max_f32: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_min_max_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_maxmin_f32 v1, v1, v2, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_global_nnans_min_max_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_maxmin_f32 v1, v1, v2, v3 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll index cbd824e171976..850aeb60335e8 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll @@ -350,22 +350,22 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half % ; SDAG-GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX9-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX9-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: +; SDAG-GFX9: ; %bb.0: +; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX9-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp -; VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: ; SDAG-CI: ; %bb.0: @@ -378,19 +378,41 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half % ; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GISEL-GFX11-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_med3_f32 v0, v0, 0, 1.0 ; GISEL-GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: +; GISEL-GFX9: ; %bb.0: +; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX9-NEXT: v_med3_f32 v0, v0, 0, 1.0 +; GISEL-GFX9-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-VI-NEXT: v_med3_f32 v0, v2, 0, 1.0 +; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-CI-NEXT: v_med3_f32 v0, v2, 0, 1.0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v0 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float @@ -405,27 +427,27 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half % } define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half %src0, half %src1, half %src2) #0 { -; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX9-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: +; SDAG-GFX9: ; %bb.0: +; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: ; SDAG-CI: ; %bb.0: @@ -435,6 +457,36 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v0 clamp ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_med3_f16 v0, v0, 0, 1.0 +; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: +; GISEL-GFX9: ; %bb.0: +; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX9-NEXT: v_med3_f16 v0, v0, 0, 1.0 +; GISEL-GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GISEL-VI-NEXT: v_max_f16_e32 v0, 0, v0 +; GISEL-VI-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -464,36 +516,36 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half } define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use(half %src0, half %src1, half %src2) #0 { -; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX11-NEXT: global_store_b16 v[0:1], v3, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX11-NEXT: global_store_b16 v[0:1], v3, off dlc +; SDAG-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX9-NEXT: global_store_short v[0:1], v3, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX9-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: +; SDAG-GFX9: ; %bb.0: +; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX9-NEXT: global_store_short v[0:1], v3, off +; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; VI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; VI-NEXT: flat_store_short v[0:1], v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_f16_sdwa v0, v0, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SDAG-VI-NEXT: flat_store_short v[0:1], v0 +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) +; SDAG-VI-NEXT: v_max_f16_sdwa v0, v0, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: ; SDAG-CI: ; %bb.0: @@ -507,6 +559,42 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_med3_f16 v0, v1, 0, 1.0 +; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GISEL-GFX11-NEXT: global_store_b16 v[0:1], v1, off dlc +; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: +; GISEL-GFX9: ; %bb.0: +; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX9-NEXT: global_store_short v[0:1], v0, off +; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX9-NEXT: v_med3_f16 v0, v0, 0, 1.0 +; GISEL-GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GISEL-VI-NEXT: flat_store_short v[0:1], v0 +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) +; GISEL-VI-NEXT: v_max_f16_e32 v0, 0, v0 +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GISEL-VI-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll index 32e0d393a1001..a9d07877b3887 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -271,32 +271,38 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2 } define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %src1, float %src2) #0 { -; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp -; GFX900-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp -; GFX906-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; VI-NEXT: v_cvt_f16_f32_e64 v0, v2 clamp -; VI-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v0, v2 clamp +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: ; SDAG-CI: ; %bb.0: @@ -306,6 +312,39 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_med3_f16 v0, v0, 0, 1.0 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GISEL-GFX900-NEXT: v_med3_f16 v0, v0, 0, 1.0 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GISEL-GFX906-NEXT: v_med3_f16 v0, v0, 0, 1.0 +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GISEL-VI-NEXT: v_max_f16_e32 v0, 0, v0 +; GISEL-VI-NEXT: v_min_f16_e32 v0, 1.0, v0 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -348,28 +387,28 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt(half %src0, half %src ; SDAG-GFX1100-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp -; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp +; SDAG-GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp -; GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX906-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp +; SDAG-GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp -; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: ; SDAG-CI: ; %bb.0: @@ -382,17 +421,45 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt(half %src0, half %src ; GISEL-GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: ; GISEL-GFX1100: ; %bb.0: ; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp -; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_med3_f32 v0, v0, 0, 1.0 ; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GISEL-GFX900-NEXT: v_med3_f32 v0, v0, 0, 1.0 +; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GISEL-GFX906-NEXT: v_med3_f32 v0, v0, 0, 1.0 +; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-VI-NEXT: v_med3_f32 v0, v2, 0, 1.0 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-CI-NEXT: v_med3_f32 v0, v2, 0, 1.0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float @@ -914,30 +981,39 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half ; FIXME (DAG): Fold clamp define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { -; GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: v_mov_b32_e32 v0, v3 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_mov_b32_e32 v0, v3 -; GFX906-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX906-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt: ; SDAG-VI: ; %bb.0: @@ -978,6 +1054,35 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: v_pk_max_f16 v0, v3, 0 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_pk_max_f16 v0, v3, 0 +; GISEL-GFX900-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_pk_max_f16 v0, v3, 0 +; GISEL-GFX906-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -989,8 +1094,13 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s ; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GISEL-VI-NEXT: v_mac_f32_e32 v5, v3, v4 ; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v0, v5 clamp -; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v1, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0x3c00 +; GISEL-VI-NEXT: v_max_f16_e32 v0, 0, v0 +; GISEL-VI-NEXT: v_max_f16_e32 v1, 0, v1 +; GISEL-VI-NEXT: v_min_f16_e32 v0, 1.0, v0 +; GISEL-VI-NEXT: v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1147,33 +1257,36 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s ; GISEL-GFX1100-LABEL: v_mad_mix_v3f32_clamp_postcvt: ; GISEL-GFX1100: ; %bb.0: ; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] ; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] ; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; GISEL-GFX1100-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v6 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_pk_max_f16 v0, v6, 0 +; GISEL-GFX1100-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] ; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt: ; GISEL-GFX900: ; %bb.0: ; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_pk_max_f16 v0, v6, 0 ; GISEL-GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-GFX900-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] ; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt: ; GISEL-GFX906: ; %bb.0: ; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_pk_max_f16 v0, v6, 0 ; GISEL-GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-GFX906-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] ; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt: @@ -1190,11 +1303,18 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GISEL-VI-NEXT: v_mac_f32_e32 v8, v6, v7 ; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2 -; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v0, v8 clamp -; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v2, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v8 ; GISEL-VI-NEXT: v_mac_f32_e32 v5, v1, v3 -; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v1, v5 clamp -; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; GISEL-VI-NEXT: v_max_f16_e32 v0, 0, v0 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, 0x3c00 +; GISEL-VI-NEXT: v_max_f16_e32 v1, 0, v1 +; GISEL-VI-NEXT: v_max_f16_e32 v2, 0, v2 +; GISEL-VI-NEXT: v_min_f16_e32 v0, 1.0, v0 +; GISEL-VI-NEXT: v_min_f16_sdwa v3, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GISEL-VI-NEXT: v_min_f16_e32 v1, 1.0, v2 +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v3 ; GISEL-VI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_v3f32_clamp_postcvt: @@ -1247,39 +1367,51 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s } define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 { -; GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-FAKE16-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_mov_b32_e32 v0, v6 -; GFX906-NEXT: v_mov_b32_e32 v1, v2 -; GFX906-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, v6 +; SDAG-GFX900-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX906-NEXT: v_mov_b32_e32 v0, v6 +; SDAG-GFX906-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt: ; SDAG-VI: ; %bb.0: @@ -1358,6 +1490,48 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX1100-NEXT: v_pk_max_f16 v0, v6, 0 +; GISEL-GFX1100-NEXT: v_pk_max_f16 v1, v7, 0 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX1100-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GISEL-GFX1100-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_pk_max_f16 v0, v6, 0 +; GISEL-GFX900-NEXT: v_pk_max_f16 v1, v7, 0 +; GISEL-GFX900-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GISEL-GFX900-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_pk_max_f16 v0, v6, 0 +; GISEL-GFX906-NEXT: v_pk_max_f16 v1, v7, 0 +; GISEL-GFX906-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GISEL-GFX906-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1377,10 +1551,19 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s ; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2 ; GISEL-VI-NEXT: v_mac_f32_e32 v11, v7, v9 ; GISEL-VI-NEXT: v_mac_f32_e32 v5, v1, v3 -; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v0, v10 clamp -; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v1, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v2, v11 clamp -; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v3, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v2, v11 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v3, v5 +; GISEL-VI-NEXT: v_max_f16_e32 v0, 0, v0 +; GISEL-VI-NEXT: v_max_f16_e32 v1, 0, v1 +; GISEL-VI-NEXT: v_max_f16_e32 v2, 0, v2 +; GISEL-VI-NEXT: v_max_f16_e32 v3, 0, v3 +; GISEL-VI-NEXT: v_mov_b32_e32 v4, 0x3c00 +; GISEL-VI-NEXT: v_min_f16_e32 v0, 1.0, v0 +; GISEL-VI-NEXT: v_min_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GISEL-VI-NEXT: v_min_f16_e32 v2, 1.0, v2 +; GISEL-VI-NEXT: v_min_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-VI-NEXT: v_or_b32_e32 v1, v2, v3 ; GISEL-VI-NEXT: s_setpc_b64 s[30:31] @@ -1528,7 +1711,7 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> ; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] ; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GISEL-GFX1100-NEXT: v_mov_b32_e32 v4, v3 -; GISEL-GFX1100-NEXT: v_max_f16_e64 v3, v3, v3 clamp +; GISEL-GFX1100-NEXT: v_med3_f16 v3, v3, 0, 1.0 ; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GISEL-GFX1100-NEXT: v_and_b32_e32 v0, 0xffff, v3 @@ -1539,20 +1722,22 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> ; GISEL-GFX900: ; %bb.0: ; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; GISEL-GFX900-NEXT: v_max_f16_e64 v4, v3, v3 clamp +; GISEL-GFX900-NEXT: v_med3_f16 v4, v3, 0, 1.0 ; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, 0xffff0000 -; GISEL-GFX900-NEXT: v_and_or_b32 v0, v3, v0, v4 +; GISEL-GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GISEL-GFX900-NEXT: v_mov_b32_e32 v1, 0xffff0000 +; GISEL-GFX900-NEXT: v_and_or_b32 v0, v3, v1, v0 ; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: ; GISEL-GFX906: ; %bb.0: ; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; GISEL-GFX906-NEXT: v_max_f16_e64 v4, v3, v3 clamp +; GISEL-GFX906-NEXT: v_med3_f16 v4, v3, 0, 1.0 ; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, 0xffff0000 -; GISEL-GFX906-NEXT: v_and_or_b32 v0, v3, v0, v4 +; GISEL-GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GISEL-GFX906-NEXT: v_mov_b32_e32 v1, 0xffff0000 +; GISEL-GFX906-NEXT: v_and_or_b32 v0, v3, v1, v0 ; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: @@ -1690,13 +1875,15 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> ; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: ; GISEL-GFX1100: ; %bb.0: ; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v4, v0, v1, v2 op_sel_hi:[1,1,1] ; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GISEL-GFX1100-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GISEL-GFX1100-NEXT: v_med3_f16 v3, v3, 0, 1.0 ; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GISEL-GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX1100-NEXT: v_and_or_b32 v0, 0xffff, v4, v0 ; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] ; @@ -1704,9 +1891,10 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> ; GISEL-GFX900: ; %bb.0: ; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GISEL-GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GISEL-GFX900-NEXT: v_med3_f16 v0, v4, 0, 1.0 +; GISEL-GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GISEL-GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GISEL-GFX900-NEXT: v_mov_b32_e32 v1, 0xffff ; GISEL-GFX900-NEXT: v_and_or_b32 v0, v3, v1, v0 @@ -1716,9 +1904,10 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> ; GISEL-GFX906: ; %bb.0: ; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GISEL-GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GISEL-GFX906-NEXT: v_med3_f16 v0, v4, 0, 1.0 +; GISEL-GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GISEL-GFX906-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GISEL-GFX906-NEXT: v_mov_b32_e32 v1, 0xffff ; GISEL-GFX906-NEXT: v_and_or_b32 v0, v3, v1, v0 @@ -1872,10 +2061,13 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr ; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_precvt: ; GISEL-GFX1100: ; %bb.0: ; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX1100-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GISEL-GFX1100-NEXT: v_med3_f32 v1, v3, 0, 1.0 +; GISEL-GFX1100-NEXT: v_med3_f32 v0, v0, 0, 1.0 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX1100-NEXT: v_pack_b32_f16 v0, v1, v0 @@ -1884,9 +2076,11 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr ; GISEL-GFX900-LABEL: v_mad_mix_v2f32_clamp_precvt: ; GISEL-GFX900: ; %bb.0: ; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX900-NEXT: v_mad_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GISEL-GFX900-NEXT: v_mad_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_med3_f32 v1, v3, 0, 1.0 +; GISEL-GFX900-NEXT: v_med3_f32 v0, v0, 0, 1.0 +; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-GFX900-NEXT: v_pack_b32_f16 v0, v1, v0 ; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1894,9 +2088,11 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr ; GISEL-GFX906-LABEL: v_mad_mix_v2f32_clamp_precvt: ; GISEL-GFX906: ; %bb.0: ; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GISEL-GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_med3_f32 v1, v3, 0, 1.0 +; GISEL-GFX906-NEXT: v_med3_f32 v0, v0, 0, 1.0 +; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-GFX906-NEXT: v_pack_b32_f16 v0, v1, v0 ; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1910,11 +2106,13 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr ; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GISEL-VI-NEXT: v_mad_f32 v3, v3, v4, v5 clamp -; GISEL-VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GISEL-VI-NEXT: v_or_b32_e32 v0, v1, v0 +; GISEL-VI-NEXT: v_mac_f32_e32 v5, v3, v4 +; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-VI-NEXT: v_med3_f32 v0, v5, 0, 1.0 +; GISEL-VI-NEXT: v_med3_f32 v1, v2, 0, 1.0 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-VI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_precvt: @@ -1926,8 +2124,10 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp -; GISEL-CI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp +; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2 +; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3 +; GISEL-CI-NEXT: v_med3_f32 v0, v4, 0, 1.0 +; GISEL-CI-NEXT: v_med3_f32 v1, v5, 0, 1.0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] @@ -2052,25 +2252,33 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr ; GISEL-GFX1100-LABEL: v_mad_mix_v3f32_clamp_precvt: ; GISEL-GFX1100: ; %bb.0: ; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX1100-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GISEL-GFX1100-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GISEL-GFX1100-NEXT: v_med3_f32 v2, v6, 0, 1.0 +; GISEL-GFX1100-NEXT: v_med3_f32 v0, v0, 0, 1.0 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GISEL-GFX1100-NEXT: v_med3_f32 v1, v1, 0, 1.0 +; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v2, v6 ; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GISEL-GFX1100-NEXT: v_pack_b32_f16 v0, v2, v0 ; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX900-LABEL: v_mad_mix_v3f32_clamp_precvt: ; GISEL-GFX900: ; %bb.0: ; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX900-NEXT: v_mad_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GISEL-GFX900-NEXT: v_mad_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_med3_f32 v2, v6, 0, 1.0 +; GISEL-GFX900-NEXT: v_med3_f32 v0, v0, 0, 1.0 +; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_med3_f32 v1, v1, 0, 1.0 ; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-GFX900-NEXT: v_pack_b32_f16 v0, v2, v0 ; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] @@ -2078,11 +2286,14 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr ; GISEL-GFX906-LABEL: v_mad_mix_v3f32_clamp_precvt: ; GISEL-GFX906: ; %bb.0: ; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX906-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GISEL-GFX906-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_med3_f32 v2, v6, 0, 1.0 +; GISEL-GFX906-NEXT: v_med3_f32 v0, v0, 0, 1.0 +; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_med3_f32 v1, v1, 0, 1.0 ; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-GFX906-NEXT: v_pack_b32_f16 v0, v2, v0 ; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] @@ -2099,13 +2310,16 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GISEL-VI-NEXT: v_mad_f32 v6, v6, v7, v8 clamp -; GISEL-VI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v2, v6 -; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GISEL-VI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp +; GISEL-VI-NEXT: v_mac_f32_e32 v8, v6, v7 +; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2 +; GISEL-VI-NEXT: v_med3_f32 v0, v8, 0, 1.0 +; GISEL-VI-NEXT: v_mac_f32_e32 v5, v1, v3 +; GISEL-VI-NEXT: v_med3_f32 v1, v4, 0, 1.0 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GISEL-VI-NEXT: v_med3_f32 v1, v5, 0, 1.0 ; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-VI-NEXT: v_or_b32_e32 v0, v2, v0 +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-VI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_v3f32_clamp_precvt: @@ -2120,9 +2334,12 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v3, v6 clamp -; GISEL-CI-NEXT: v_mad_f32 v1, v1, v4, v7 clamp -; GISEL-CI-NEXT: v_mad_f32 v2, v2, v5, v8 clamp +; GISEL-CI-NEXT: v_mac_f32_e32 v6, v0, v3 +; GISEL-CI-NEXT: v_mac_f32_e32 v7, v1, v4 +; GISEL-CI-NEXT: v_mac_f32_e32 v8, v2, v5 +; GISEL-CI-NEXT: v_med3_f32 v0, v6, 0, 1.0 +; GISEL-CI-NEXT: v_med3_f32 v1, v7, 0, 1.0 +; GISEL-CI-NEXT: v_med3_f32 v2, v8, 0, 1.0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -2275,12 +2492,18 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr ; GISEL-GFX1100-LABEL: v_mad_mix_v4f32_clamp_precvt: ; GISEL-GFX1100: ; %bb.0: ; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX1100-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GISEL-GFX1100-NEXT: v_fma_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GISEL-GFX1100-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX1100-NEXT: v_med3_f32 v3, v6, 0, 1.0 +; GISEL-GFX1100-NEXT: v_med3_f32 v0, v0, 0, 1.0 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX1100-NEXT: v_med3_f32 v2, v2, 0, 1.0 +; GISEL-GFX1100-NEXT: v_med3_f32 v1, v1, 0, 1.0 ; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -2293,11 +2516,15 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr ; GISEL-GFX900-LABEL: v_mad_mix_v4f32_clamp_precvt: ; GISEL-GFX900: ; %bb.0: ; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX900-NEXT: v_mad_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GISEL-GFX900-NEXT: v_mad_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GISEL-GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GISEL-GFX900-NEXT: v_mad_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_mad_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_med3_f32 v3, v6, 0, 1.0 +; GISEL-GFX900-NEXT: v_med3_f32 v0, v0, 0, 1.0 +; GISEL-GFX900-NEXT: v_med3_f32 v2, v2, 0, 1.0 +; GISEL-GFX900-NEXT: v_med3_f32 v1, v1, 0, 1.0 +; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -2308,11 +2535,15 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr ; GISEL-GFX906-LABEL: v_mad_mix_v4f32_clamp_precvt: ; GISEL-GFX906: ; %bb.0: ; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX906-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GISEL-GFX906-NEXT: v_fma_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GISEL-GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GISEL-GFX906-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_fma_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_med3_f32 v3, v6, 0, 1.0 +; GISEL-GFX906-NEXT: v_med3_f32 v0, v0, 0, 1.0 +; GISEL-GFX906-NEXT: v_med3_f32 v2, v2, 0, 1.0 +; GISEL-GFX906-NEXT: v_med3_f32 v1, v1, 0, 1.0 +; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -2335,16 +2566,20 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr ; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v11, v5 ; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GISEL-VI-NEXT: v_mad_f32 v6, v6, v8, v10 clamp -; GISEL-VI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp -; GISEL-VI-NEXT: v_mad_f32 v2, v7, v9, v11 clamp -; GISEL-VI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GISEL-VI-NEXT: v_mac_f32_e32 v10, v6, v8 +; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2 +; GISEL-VI-NEXT: v_mac_f32_e32 v11, v7, v9 +; GISEL-VI-NEXT: v_mac_f32_e32 v5, v1, v3 +; GISEL-VI-NEXT: v_med3_f32 v0, v10, 0, 1.0 +; GISEL-VI-NEXT: v_med3_f32 v1, v4, 0, 1.0 +; GISEL-VI-NEXT: v_med3_f32 v2, v11, 0, 1.0 +; GISEL-VI-NEXT: v_med3_f32 v3, v5, 0, 1.0 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GISEL-VI-NEXT: v_or_b32_e32 v0, v3, v0 -; GISEL-VI-NEXT: v_or_b32_e32 v1, v2, v1 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-VI-NEXT: v_or_b32_e32 v1, v2, v3 ; GISEL-VI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_v4f32_clamp_precvt: @@ -2362,10 +2597,14 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v4, v8 clamp -; GISEL-CI-NEXT: v_mad_f32 v1, v1, v5, v9 clamp -; GISEL-CI-NEXT: v_mad_f32 v2, v2, v6, v10 clamp -; GISEL-CI-NEXT: v_mad_f32 v3, v3, v7, v11 clamp +; GISEL-CI-NEXT: v_mac_f32_e32 v8, v0, v4 +; GISEL-CI-NEXT: v_mac_f32_e32 v9, v1, v5 +; GISEL-CI-NEXT: v_mac_f32_e32 v10, v2, v6 +; GISEL-CI-NEXT: v_mac_f32_e32 v11, v3, v7 +; GISEL-CI-NEXT: v_med3_f32 v0, v8, 0, 1.0 +; GISEL-CI-NEXT: v_med3_f32 v1, v9, 0, 1.0 +; GISEL-CI-NEXT: v_med3_f32 v2, v10, 0, 1.0 +; GISEL-CI-NEXT: v_med3_f32 v3, v11, 0, 1.0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll index e2170fa406da4..53db04e21af6e 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll @@ -1599,41 +1599,41 @@ define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %s } define float @v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { -; GFX1100-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX900-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX906-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX906-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] ; -; GFX9GEN-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: -; GFX9GEN: ; %bb.0: -; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9GEN-NEXT: v_mad_f32 v0, v0, v1, v2 clamp -; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX9GEN-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: +; SDAG-GFX9GEN: ; %bb.0: +; SDAG-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-GFX9GEN-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; SDAG-GFX9GEN-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp -; VI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-CI-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: ; SDAG-CI: ; %bb.0: @@ -1641,13 +1641,56 @@ define float @v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x h ; SDAG-CI-NEXT: v_mad_f32 v0, v1, v3, v5 clamp ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_med3_f32 v0, v0, 0, 1.0 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_med3_f32 v0, v0, 0, 1.0 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_med3_f32 v0, v0, 0, 1.0 +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9GEN-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: +; GISEL-GFX9GEN: ; %bb.0: +; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-GFX9GEN-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-GFX9GEN-NEXT: v_med3_f32 v0, v2, 0, 1.0 +; GISEL-GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-VI-NEXT: v_med3_f32 v0, v2, 0, 1.0 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-CI-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v3 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-CI-NEXT: v_med3_f32 v0, v2, 0, 1.0 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.hi = extractelement <2 x half> %src0, i32 1 %src1.hi = extractelement <2 x half> %src1, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll index bdd8935d0df5e..2958ca7122cb7 100644 --- a/llvm/test/CodeGen/AMDGPU/minmax.ll +++ b/llvm/test/CodeGen/AMDGPU/minmax.ll @@ -445,23 +445,47 @@ define amdgpu_ps float @test_maxmin_commuted_f32_ieee_false(float %a, float %b, } define void @test_med3_f32(ptr addrspace(1) %arg, float %x, float %y, float %z) #0 { -; GFX11-LABEL: test_med3_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_med3_f32 v2, v2, v3, v4 -; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-LABEL: test_med3_f32: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_med3_f32 v2, v2, v3, v4 +; SDAG-GFX11-NEXT: global_store_b32 v[0:1], v2, off +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX12-LABEL: test_med3_f32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_med3_num_f32 v2, v2, v3, v4 -; GFX12-NEXT: global_store_b32 v[0:1], v2, off -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX11-LABEL: test_med3_f32: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3 +; GISEL-GFX11-NEXT: v_min_f32_e32 v5, v2, v3 +; GISEL-GFX11-NEXT: v_dual_max_f32 v2, v2, v3 :: v_dual_max_f32 v3, v4, v4 +; GISEL-GFX11-NEXT: v_minmax_f32 v2, v2, v3, v5 +; GISEL-GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: test_med3_f32: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_med3_num_f32 v2, v2, v3, v4 +; SDAG-GFX12-NEXT: global_store_b32 v[0:1], v2, off +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: test_med3_f32: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3 +; GISEL-GFX12-NEXT: v_min_num_f32_e32 v5, v2, v3 +; GISEL-GFX12-NEXT: v_dual_max_num_f32 v2, v2, v3 :: v_dual_max_num_f32 v3, v4, v4 +; GISEL-GFX12-NEXT: v_minmax_num_f32 v2, v2, v3, v5 +; GISEL-GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] %tmp0 = call float @llvm.minnum.f32(float %x, float %y) %tmp1 = call float @llvm.maxnum.f32(float %x, float %y) %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %z) @@ -471,23 +495,47 @@ define void @test_med3_f32(ptr addrspace(1) %arg, float %x, float %y, float %z) } define void @test_med3_minimumnum_maximumnum_f32(ptr addrspace(1) %arg, float %x, float %y, float %z) #0 { -; GFX11-LABEL: test_med3_minimumnum_maximumnum_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_med3_f32 v2, v2, v3, v4 -; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-LABEL: test_med3_minimumnum_maximumnum_f32: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_med3_f32 v2, v2, v3, v4 +; SDAG-GFX11-NEXT: global_store_b32 v[0:1], v2, off +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX12-LABEL: test_med3_minimumnum_maximumnum_f32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_med3_num_f32 v2, v2, v3, v4 -; GFX12-NEXT: global_store_b32 v[0:1], v2, off -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX11-LABEL: test_med3_minimumnum_maximumnum_f32: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3 +; GISEL-GFX11-NEXT: v_min_f32_e32 v5, v2, v3 +; GISEL-GFX11-NEXT: v_dual_max_f32 v2, v2, v3 :: v_dual_max_f32 v3, v4, v4 +; GISEL-GFX11-NEXT: v_minmax_f32 v2, v2, v3, v5 +; GISEL-GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: test_med3_minimumnum_maximumnum_f32: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_med3_num_f32 v2, v2, v3, v4 +; SDAG-GFX12-NEXT: global_store_b32 v[0:1], v2, off +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: test_med3_minimumnum_maximumnum_f32: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3 +; GISEL-GFX12-NEXT: v_min_num_f32_e32 v5, v2, v3 +; GISEL-GFX12-NEXT: v_dual_max_num_f32 v2, v2, v3 :: v_dual_max_num_f32 v3, v4, v4 +; GISEL-GFX12-NEXT: v_minmax_num_f32 v2, v2, v3, v5 +; GISEL-GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] %tmp0 = call float @llvm.minimumnum.f32(float %x, float %y) %tmp1 = call float @llvm.maximumnum.f32(float %x, float %y) %tmp2 = call float @llvm.minimumnum.f32(float %tmp1, float %z) @@ -875,14 +923,24 @@ define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0 ; GISEL-GFX11-TRUE16-LABEL: test_med3_f16: ; GISEL-GFX11-TRUE16: ; %bb.0: ; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-TRUE16-NEXT: v_med3_f16 v2.l, v2.l, v3.l, v4.l +; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l +; GISEL-GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.l, v2.h +; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.h +; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l +; GISEL-GFX11-TRUE16-NEXT: v_minmax_f16 v2.l, v2.l, v2.h, v3.l ; GISEL-GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off ; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX11-FAKE16-LABEL: test_med3_f16: ; GISEL-GFX11-FAKE16: ; %bb.0: ; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-FAKE16-NEXT: v_med3_f16 v2, v2, v3, v4 +; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GISEL-GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v2, v3 +; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v3 +; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v4, v4 +; GISEL-GFX11-FAKE16-NEXT: v_minmax_f16 v2, v2, v3, v5 ; GISEL-GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -915,7 +973,12 @@ define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0 ; GISEL-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GISEL-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GISEL-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-TRUE16-NEXT: v_med3_num_f16 v2.l, v2.l, v3.l, v4.l +; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l +; GISEL-GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.l, v2.h +; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.h +; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l +; GISEL-GFX12-TRUE16-NEXT: v_minmax_num_f16 v2.l, v2.l, v2.h, v3.l ; GISEL-GFX12-TRUE16-NEXT: global_store_b16 v[0:1], v2, off ; GISEL-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -926,7 +989,12 @@ define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0 ; GISEL-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GISEL-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GISEL-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-FAKE16-NEXT: v_med3_num_f16 v2, v2, v3, v4 +; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GISEL-GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v2, v3 +; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v3 +; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v4, v4 +; GISEL-GFX12-FAKE16-NEXT: v_minmax_num_f16 v2, v2, v3, v5 ; GISEL-GFX12-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %tmp0 = call half @llvm.minnum.f16(half %x, half %y)