From 390fd92f69c269a33a7a6c01595f80321c510ca7 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Thu, 27 Mar 2025 10:57:42 -0400 Subject: [PATCH 01/26] [AMDGPU] SIPeepholeSDWA: Reject V_CNDMASK_B32_e64 instead of V_CNDMASK_B32_e32 The problem with V_CNDMASK_B32_e64 (i.e. that conversion to the VOP2 SDWA form introduces an implicit VCC use) hinted at by the comment does not exist with V_CNDMASK_B32_e32. Hence the latter should already be acceptable for conversion to SDWA without further ado. --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 9 ++-- .../CodeGen/AMDGPU/sdwa-peephole-vcndmask.mir | 41 +++++++++++++++++++ 2 files changed, 46 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-vcndmask.mir diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 22f23e4c94e2d..f40b293af8b15 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1070,6 +1070,11 @@ bool isConvertibleToSDWA(MachineInstr &MI, if (TII->isSDWA(Opc)) return true; + // FIXME V_CNDMASK_B32_e64 needs handling of the implicit VCC use + // introduced by conversion to VOP2. + if (Opc == AMDGPU::V_CNDMASK_B32_e64) + return false; + // Check if this instruction has opcode that supports SDWA if (AMDGPU::getSDWAOp(Opc) == -1) Opc = AMDGPU::getVOPe32(Opc); @@ -1108,10 +1113,6 @@ bool isConvertibleToSDWA(MachineInstr &MI, if (TII->pseudoToMCOpcode(Opc) == -1) return false; - // FIXME: has SDWA but require handling of implicit VCC use - if (Opc == AMDGPU::V_CNDMASK_B32_e32) - return false; - if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) { if (!Src0->isReg() && !Src0->isImm()) return false; diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-vcndmask.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-vcndmask.mir new file mode 100644 index 0000000000000..46af1e214aa45 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-vcndmask.mir @@ -0,0 +1,41 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=si-peephole-sdwa -mcpu=gfx803 -o - %s | FileCheck -check-prefix=gfx8 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=si-peephole-sdwa -mcpu=gfx1100 -o - %s | FileCheck -check-prefix=gfx11 %s + +--- +name: v_cndmask_b32_test +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vcc + + ; gfx8-LABEL: name: v_cndmask_b32_test + ; gfx8: liveins: $vgpr0, $vgpr1, $vcc + ; gfx8-NEXT: {{ $}} + ; gfx8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; gfx8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; gfx8-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY]], implicit $exec + ; gfx8-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY1]], implicit $exec + ; gfx8-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[COPY]], 0, [[COPY1]], 0, 6, 0, 5, 5, implicit $vcc, implicit $exec + ; gfx8-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] + ; gfx8-NEXT: SI_RETURN implicit $vgpr0 + ; + ; gfx11-LABEL: name: v_cndmask_b32_test + ; gfx11: liveins: $vgpr0, $vgpr1, $vcc + ; gfx11-NEXT: {{ $}} + ; gfx11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; gfx11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; gfx11-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY]], implicit $exec + ; gfx11-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY1]], implicit $exec + ; 
gfx11-NEXT: [[V_CNDMASK_B32_e32_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 killed [[V_LSHRREV_B32_e64_]], killed [[V_LSHRREV_B32_e64_1]], implicit $exec, implicit $vcc_lo + ; gfx11-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e32_]] + ; gfx11-NEXT: SI_RETURN implicit $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr0 + %3:vgpr_32 = V_LSHRREV_B32_e64 16, %1, implicit $exec + %4:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec + %7:vgpr_32 = V_CNDMASK_B32_e32 killed %3, killed %4, implicit $exec, implicit $vcc + $vgpr0 = COPY %7 + SI_RETURN implicit $vgpr0 + +... From c9b7002032b48db46a2f472a45646a94bf092750 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Fri, 4 Apr 2025 07:34:20 -0400 Subject: [PATCH 02/26] [AMDGPU] SIPeepholeSDWA: Handle V_CNDMASK_B32_e64 The VOP3 form of the V_CNDMASK_B32 instruction takes a carry-in operand. The conversion to SDWA implies a conversion to VOP2 form which reads from VCC instead. Convert V_CNDMASK_B32_e64 instructions that might be converted to SDWA to V_CNDMASK_B32_e32 first and either change the instruction that defines the carry-in operand to write to VCC if this is possible or introduce a write of the carry-in operand to VCC. --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 92 +- llvm/test/CodeGen/AMDGPU/bf16.ll | 1052 +++++++---------- llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 12 +- llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 14 +- .../CodeGen/AMDGPU/dagcombine-fmul-sel.ll | 40 +- .../CodeGen/AMDGPU/extract_vector_elt-f16.ll | 88 +- .../CodeGen/AMDGPU/extract_vector_elt-i16.ll | 570 +++++++-- llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll | 70 +- llvm/test/CodeGen/AMDGPU/fmaximum3.ll | 255 ++-- llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll | 70 +- llvm/test/CodeGen/AMDGPU/fminimum3.ll | 255 ++-- .../CodeGen/AMDGPU/fneg-modifier-casting.ll | 30 +- .../CodeGen/AMDGPU/insert_vector_dynelt.ll | 50 +- .../AMDGPU/insert_vector_elt.v2bf16.ll | 650 +++++----- .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 430 ++++--- llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 382 +++--- llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 382 +++--- llvm/test/CodeGen/AMDGPU/maximumnum.ll | 6 +- llvm/test/CodeGen/AMDGPU/minimumnum.ll | 6 +- llvm/test/CodeGen/AMDGPU/saddsat.ll | 26 +- .../AMDGPU/sdwa-peephole-vcnd_mask-1.mir | 48 + .../AMDGPU/sdwa-peephole-vcnd_mask-2.mir | 43 + .../AMDGPU/select-fabs-fneg-extract.v2f16.ll | 332 ++---- llvm/test/CodeGen/AMDGPU/select.f16.ll | 642 ++++------ llvm/test/CodeGen/AMDGPU/ssubsat.ll | 26 +- 25 files changed, 2638 insertions(+), 2933 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-vcnd_mask-1.mir create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-vcnd_mask-2.mir diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index f40b293af8b15..f5f808623cc0c 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -62,6 +62,7 @@ class SIPeepholeSDWA { std::unique_ptr matchSDWAOperand(MachineInstr &MI); void pseudoOpConvertToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const; + void convertToImplicitVcc(MachineInstr &MI, const GCNSubtarget &ST) const; MachineInstr *createSDWAVersion(MachineInstr &MI); bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const; @@ -1061,6 +1062,79 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI, MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI); } +static 
unsigned getVCmpEqOpcode(unsigned Bits) {
+  if (Bits == 64)
+    return AMDGPU::V_CMP_EQ_U64_e64;
+  if (Bits == 32)
+    return AMDGPU::V_CMP_EQ_U32_e64;
+  if (Bits == 16)
+    return AMDGPU::V_CMP_EQ_U16_e64;
+
+  llvm_unreachable("Unexpected register bit width.");
+}
+
+/// Try to convert \p MI, which is in VOP3 form and takes a src2 carry-in
+/// operand, into the corresponding VOP2 form, which expects the
+/// argument in VCC. To this end, either try to change the definition
+/// of the carry-in operand to write to VCC or add an instruction that
+/// copies from the carry-in to VCC. The conversion will only be
+/// applied if \p MI can be shrunk to VOP2 and if VCC can be proven to
+/// be dead before \p MI.
+void SIPeepholeSDWA::convertToImplicitVcc(MachineInstr &MI,
+                                          const GCNSubtarget &ST) const {
+  assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64);
+
+  MCRegister Vcc = TRI->getVCC();
+  // FIXME: Conversion introduces an implicit vcc_hi use.
+  if (Vcc == AMDGPU::VCC_LO)
+    return;
+
+  LLVM_DEBUG(dbgs() << "Attempting VOP2 conversion: " << MI);
+  if (!TII->canShrink(MI, *MRI)) {
+    LLVM_DEBUG(dbgs() << "Cannot shrink instruction\n");
+    return;
+  }
+
+  const MachineOperand &CarryIn =
+      *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+
+  // Make sure VCC or its subregs are dead before MI.
+  MachineBasicBlock &MBB = *MI.getParent();
+  auto Liveness = MBB.computeRegisterLiveness(TRI, Vcc, MI, 100);
+  if (Liveness != MachineBasicBlock::LQR_Dead) {
+    LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction.\n");
+    return;
+  }
+  // Change the destination of the compare instruction to VCC,
+  // or copy to VCC if the carry-in is not defined by a compare.
+  Register CarryReg = CarryIn.getReg();
+  MachineInstr &CarryDef = *MRI->getVRegDef(CarryReg);
+
+  if (CarryDef.isCompare() && TII->isVOP3(CarryDef) &&
+      MRI->hasOneUse(CarryIn.getReg())) {
+    CarryDef.substituteRegister(CarryIn.getReg(), Vcc, 0, *TRI);
+    CarryDef.moveBefore(&MI);
+  } else {
+    // Add write: VCC[laneId] <- (CarryIn[laneId] == 1)
+    const TargetRegisterClass *Class =
+        TRI->getRegClassForOperandReg(*MRI, CarryIn);
+    unsigned RegSize = Class->MC->getSizeInBits();
+    BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(getVCmpEqOpcode(RegSize)))
+        .addReg(Vcc, RegState::Define)
+        .addImm(1)
+        .add(CarryIn);
+  }
+
+  auto Converted = BuildMI(MBB, MI, MI.getDebugLoc(),
+                           TII->get(AMDGPU::getVOPe32(MI.getOpcode())))
+                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
+                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
+                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
+                       .setMIFlags(MI.getFlags());
+  LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted << '\n');
+  MI.eraseFromParent();
+}
+
 namespace {
 
 bool isConvertibleToSDWA(MachineInstr &MI,
                          const GCNSubtarget &ST,
@@ -1070,8 +1144,8 @@ bool isConvertibleToSDWA(MachineInstr &MI,
   if (TII->isSDWA(Opc))
     return true;
 
-  // FIXME V_CNDMASK_B32_e64 needs handling of the implicit VCC use
-  // introduced by conversion to VOP2.
+  // V_CNDMASK_B32_e64 can only be handled after an earlier conversion to
+  // AMDGPU::V_CNDMASK_B32_e32, which is not always possible.
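+  // Roughly (operand names are illustrative):
+  //   %r = V_CNDMASK_B32_e64 0, %a, 0, %b, %cc, implicit $exec
+  // carries its condition %cc as an explicit operand, while the VOP2/SDWA
+  // forms read it from VCC instead,
+  //   %r = V_CNDMASK_B32_e32 %a, %b, implicit $vcc, implicit $exec
+  // so the condition has to be moved into VCC first (see convertToImplicitVcc).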
if (Opc == AMDGPU::V_CNDMASK_B32_e64) return false; @@ -1385,10 +1459,18 @@ bool SIPeepholeSDWA::run(MachineFunction &MF) { for (const auto &OperandPair : SDWAOperands) { const auto &Operand = OperandPair.second; MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST); - if (PotentialMI && - (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 || - PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64)) + if (!PotentialMI) + continue; + + switch (PotentialMI->getOpcode()) { + case AMDGPU::V_ADD_CO_U32_e64: + case AMDGPU::V_SUB_CO_U32_e64: pseudoOpConvertToVOP2(*PotentialMI, ST); + break; + case AMDGPU::V_CNDMASK_B32_e64: + convertToImplicitVcc(*PotentialMI, ST); + break; + }; } SDWAOperands.clear(); diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 19b6ff68b9869..e172bf090cca7 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -38481,10 +38481,8 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_cmp_eq_u64_e64 vcc, 1, vcc +; GFX8-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -38494,10 +38492,9 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -38581,11 +38578,8 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -38596,10 +38590,8 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: 
v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -38767,17 +38759,17 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; ; GFX8-LABEL: s_select_v2bf16: ; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX8-NEXT: v_cmp_eq_u64_e64 vcc, 1, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -38885,11 +38877,10 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX8-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 @@ -40567,11 +40558,10 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX8-NEXT: v_cndmask_b32_sdwa v3, v4, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v4, s3 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_lshr_b32 s3, s2, 16 @@ -40579,11 +40569,10 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX8-NEXT: v_cndmask_b32_sdwa v1, v3, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 @@ -40769,24 +40758,18 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc +; GFX8-NEXT: v_cndmask_b32_sdwa v3, v7, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; GFX8-NEXT: v_cndmask_b32_sdwa v1, v6, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_vselect_v4bf16: @@ -40797,17 +40780,13 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v3, v7, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v6 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v6, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 @@ -40996,44 +40975,32 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v7, 1, v7 ; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v15 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GFX8-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc +; GFX8-NEXT: v_cndmask_b32_sdwa v7, v15, v11, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v14 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v15, v11, vcc +; GFX8-NEXT: v_cndmask_b32_sdwa v5, v14, v10, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc -; GFX8-NEXT: 
v_lshrrev_b32_e32 v10, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v13 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc +; GFX8-NEXT: v_cndmask_b32_sdwa v3, v13, v9, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v12 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v9, vcc +; GFX8-NEXT: v_cndmask_b32_sdwa v1, v12, v8, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_vselect_v8bf16: @@ -41044,33 +41011,25 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v7, v15, v11, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v14 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v5, v14, v10, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v13 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v3, v13, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v12 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: 
v_cndmask_b32_e32 v1, v9, v8, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v12, v8, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 @@ -41466,168 +41425,128 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX8-LABEL: v_vselect_v16bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 +; GFX8-NEXT: buffer_load_dword v15, off, s[0:3], s32 +; GFX8-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX8-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX8-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX8-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v4 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v5 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v6 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v7 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v8 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v9 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v10 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v11 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v12 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v13 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v14 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v15 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v0, s[28:29] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v29 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v1, v0, s[24:25] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v28 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, v0, s[20:21] -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v24 -; GFX8-NEXT: v_cndmask_b32_e64 v7, v30, v22, s[26:27] -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v8, v29, v21, s[22:23] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v28, v20, s[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v12, v27, v19, s[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v13, v26, v18, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v14, v25, v17, s[6:7] -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX8-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e64 v10, v0, v23, s[40:41] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v11, v0, v1, s[42:43] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v27 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v0, s[16:17] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v26 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[12:13] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[8:9] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v15, v0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v15, v24, v16, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; GFX8-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v7, v10, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_sdwa v14, v15, v23, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 +; GFX8-NEXT: v_cndmask_b32_sdwa v13, v30, v22, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v23, s[4:5] +; GFX8-NEXT: v_and_b32_e32 v23, 1, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v29, v21, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v12 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v27, v19, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v12, v30, v22, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v30, v26, v18, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v31, v25, v17, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 +; GFX8-NEXT: v_and_b32_e32 v22, 1, v7 +; GFX8-NEXT: v_cndmask_b32_sdwa v2, v29, v21, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX8-NEXT: v_cndmask_b32_sdwa v4, v28, v20, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v22 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_sdwa v2, v27, v19, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v23 +; GFX8-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_sdwa v8, v26, v18, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX8-NEXT: v_cndmask_b32_sdwa v9, v25, v17, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX8-NEXT: v_or_b32_sdwa v3, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v30, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_sdwa v8, v24, v16, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_sdwa v7, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v6, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v31, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_vselect_v16bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; GFX9-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 +; GFX9-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14 ; GFX9-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 ; GFX9-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v22, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 -; GFX9-NEXT: v_and_b32_e32 v10, 1, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v29, v21, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v29 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v22, v21, vcc -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v28, v20, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v28 ; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 -; GFX9-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v27 +; GFX9-NEXT: v_and_b32_e32 v12, 1, v12 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v22, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GFX9-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX9-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v27, v6, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, 
v23 -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX9-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v14, v21, v23, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v21 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v14, v15, v23, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v15, v15, v23, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 +; GFX9-NEXT: v_cndmask_b32_sdwa v13, v30, v22, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GFX9-NEXT: v_and_b32_e32 v23, 1, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v29, v21, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v27, v19, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v30, v22, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v30, v26, v18, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v15, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v31, v25, v17, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 +; GFX9-NEXT: v_and_b32_e32 v22, 1, v7 +; GFX9-NEXT: v_cndmask_b32_sdwa v2, v29, v21, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_cndmask_b32_sdwa v4, v28, v20, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v22 +; GFX9-NEXT: v_perm_b32 v5, v2, v5, s4 +; GFX9-NEXT: v_cndmask_b32_sdwa v2, v27, v19, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v23 +; GFX9-NEXT: v_perm_b32 v4, v4, v8, s4 +; GFX9-NEXT: v_cndmask_b32_sdwa v8, v26, v18, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-NEXT: v_cndmask_b32_sdwa v9, v25, v17, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v15, v6, vcc -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 -; GFX9-NEXT: v_perm_b32 v2, v5, v4, s4 -; GFX9-NEXT: v_perm_b32 v3, v9, v19, s4 -; GFX9-NEXT: v_perm_b32 v4, v8, v20, s4 -; GFX9-NEXT: v_perm_b32 v5, v10, v11, s4 +; GFX9-NEXT: v_perm_b32 v3, v2, v10, s4 +; GFX9-NEXT: v_perm_b32 v2, v8, v30, s4 +; GFX9-NEXT: v_cndmask_b32_sdwa v8, v24, v16, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_perm_b32 v7, v15, v14, s4 ; GFX9-NEXT: v_perm_b32 v6, v13, v12, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v14, s4 +; GFX9-NEXT: v_perm_b32 v1, v9, v31, s4 +; GFX9-NEXT: v_perm_b32 v0, v8, v0, s4 ; GFX9-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v16bf16: @@ -42507,425 +42426,336 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-LABEL: v_vselect_v32bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: v_and_b32_e32 v26, 1, v26 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v26 +; GFX8-NEXT: buffer_load_ushort v26, off, s[0:3], s32 +; GFX8-NEXT: v_and_b32_e32 v24, 1, v24 +; GFX8-NEXT: v_and_b32_e32 v22, 1, v22 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v24 +; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v22 +; GFX8-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 +; GFX8-NEXT: v_and_b32_e32 v30, 1, v30 +; GFX8-NEXT: v_and_b32_e32 v28, 1, v28 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v30 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v28 +; GFX8-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124 +; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:60 +; GFX8-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v18 +; GFX8-NEXT: v_and_b32_e32 v20, 1, v20 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v20 +; GFX8-NEXT: v_and_b32_e32 v25, 1, v25 +; GFX8-NEXT: v_and_b32_e32 v23, 1, v23 +; GFX8-NEXT: v_and_b32_e32 v21, 1, v21 +; GFX8-NEXT: v_and_b32_e32 v19, 1, v19 +; GFX8-NEXT: v_and_b32_e32 v17, 1, v17 +; GFX8-NEXT: v_and_b32_e32 v16, 1, v16 +; GFX8-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX8-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX8-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX8-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX8-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX8-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX8-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX8-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v4 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v5 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v6 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v7 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v8 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v9 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v10 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v11 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v12 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v13 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v14 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v15 
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v16 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v17 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v18 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v19 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v20 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v21 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v22 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v23 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v24 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v25 -; GFX8-NEXT: v_writelane_b32 v34, s30, 0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v26 -; GFX8-NEXT: v_writelane_b32 v34, s31, 1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v27 -; GFX8-NEXT: v_writelane_b32 v34, s34, 2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v28 -; GFX8-NEXT: v_writelane_b32 v34, s35, 3 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v29 -; GFX8-NEXT: v_writelane_b32 v34, s36, 4 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v30 -; GFX8-NEXT: v_writelane_b32 v34, s37, 5 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0 -; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; GFX8-NEXT: v_writelane_b32 v34, s38, 6 -; GFX8-NEXT: v_writelane_b32 v34, s39, 7 +; GFX8-NEXT: s_waitcnt vmcnt(5) +; GFX8-NEXT: v_and_b32_e32 v18, 1, v26 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 +; GFX8-NEXT: v_and_b32_e32 v26, 1, v27 +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_cndmask_b32_e64 v18, v24, v22, s[8:9] +; GFX8-NEXT: v_cndmask_b32_sdwa v20, v24, v22, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_and_b32_e32 v22, 1, v29 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v22 +; GFX8-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_cndmask_b32_e64 v22, v28, v30, s[6:7] +; GFX8-NEXT: v_cndmask_b32_sdwa v24, v28, v30, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v26 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0 -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 -; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 -; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 -; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 -; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 -; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 -; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 -; GFX8-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 +; GFX8-NEXT: v_cndmask_b32_e64 v26, v31, v28, s[4:5] +; GFX8-NEXT: v_cndmask_b32_sdwa v27, v31, v28, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v25 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: 
v_cndmask_b32_e64 v25, v28, v29, s[16:17] +; GFX8-NEXT: v_cndmask_b32_sdwa v28, v28, v29, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:48 +; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v23 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e64 v23, v30, v29, s[12:13] +; GFX8-NEXT: v_cndmask_b32_sdwa v29, v30, v29, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:108 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v21 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e64 v21, v30, v31, s[10:11] +; GFX8-NEXT: v_cndmask_b32_sdwa v30, v30, v31, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e64 v19, v31, v32, s[14:15] +; GFX8-NEXT: v_cndmask_b32_sdwa v31, v31, v32, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 +; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cndmask_b32_sdwa v16, v17, v32, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_e64 v17, v17, v32, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 +; GFX8-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cndmask_b32_sdwa v14, v32, v15, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_e64 v15, v32, v15, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 +; GFX8-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v12 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cndmask_b32_sdwa v12, v13, v32, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v32, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 +; GFX8-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cndmask_b32_sdwa v10, v11, v32, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v32, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 ; GFX8-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 -; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 -; GFX8-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 -; GFX8-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 -; GFX8-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 -; GFX8-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 -; GFX8-NEXT: buffer_load_dword v15, off, s[0:3], s32 
offset:32 -; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 -; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 -; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 -; GFX8-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 -; GFX8-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 -; GFX8-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 -; GFX8-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 -; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48 -; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 -; GFX8-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 -; GFX8-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 -; GFX8-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 -; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 -; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; GFX8-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:128 -; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v29 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v32 -; GFX8-NEXT: v_cndmask_b32_e64 v28, v33, v28, s[38:39] -; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[36:37] -; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v31 -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX8-NEXT: v_cndmask_b32_e64 v32, v33, v32, s[34:35] -; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, v31, s[30:31] -; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX8-NEXT: v_cndmask_b32_e64 v31, v33, v31, s[90:91] -; GFX8-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[88:89] -; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX8-NEXT: v_cndmask_b32_e64 v27, v33, v27, s[78:79] -; GFX8-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[76:77] -; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX8-NEXT: v_cndmask_b32_e64 v25, v33, v25, s[74:75] -; GFX8-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[72:73] -; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX8-NEXT: v_cndmask_b32_e64 v23, v33, v23, s[62:63] -; GFX8-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[60:61] -; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX8-NEXT: v_cndmask_b32_e64 v21, v33, v21, s[58:59] -; GFX8-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[56:57] -; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v19, v33, v19, s[46:47] -; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[44:45] -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v17, v33, v17, s[42:43] -; GFX8-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[40:41] -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v15, v33, v15, s[28:29] -; GFX8-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27] -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v13, v33, v13, s[24:25] -; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23] -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v11, v33, v11, s[20:21] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19] -; GFX8-NEXT: 
v_lshrrev_b32_e32 v9, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v9, v33, v9, s[16:17] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15] -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v7, v33, v7, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v33, v5, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v33, v3, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v9 -; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v11 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v13 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v15 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v17 -; GFX8-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v7, v14, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v19 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v23 -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v27 -; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v31 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v32 -; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v28 -; GFX8-NEXT: v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v11, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v12, v24, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v14, v30, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_readlane_b32 s39, v34, 7 -; GFX8-NEXT: v_readlane_b32 s38, v34, 6 -; GFX8-NEXT: v_readlane_b32 s37, v34, 5 -; GFX8-NEXT: v_readlane_b32 s36, v34, 4 -; GFX8-NEXT: v_readlane_b32 s35, v34, 3 -; GFX8-NEXT: v_readlane_b32 s34, v34, 2 -; GFX8-NEXT: v_readlane_b32 s31, v34, 1 -; GFX8-NEXT: v_readlane_b32 s30, v34, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; 
GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: v_cndmask_b32_sdwa v32, v8, v9, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v6 +; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 +; GFX8-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cndmask_b32_sdwa v9, v6, v7, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v4 +; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 +; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cndmask_b32_sdwa v7, v4, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 +; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 +; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cndmask_b32_sdwa v5, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 +; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cndmask_b32_sdwa v3, v0, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v8, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v6, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v8, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v9, v19, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v10, v21, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v11, v23, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v12, v25, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v13, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v14, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v15, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_vselect_v32bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v5 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v4 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v7 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v6 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v9 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v8 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v11 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v10 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v13 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v12 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v15 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v14 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v17 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v16 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v19 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v18 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v21 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v20 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v23 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v22 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v25 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v24 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v27 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v26 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v29 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[92:93], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v28 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[94:95], 1, v0 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; GFX9-NEXT: v_writelane_b32 v33, s30, 0 -; GFX9-NEXT: v_writelane_b32 v33, s31, 1 -; GFX9-NEXT: v_writelane_b32 v33, s34, 2 +; GFX9-NEXT: v_and_b32_e32 v26, 1, v26 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v26 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 +; GFX9-NEXT: v_and_b32_e32 v24, 1, v24 +; GFX9-NEXT: v_and_b32_e32 v22, 1, v22 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v24 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v22 +; GFX9-NEXT: 
buffer_load_dword v22, off, s[0:3], s32 offset:64 +; GFX9-NEXT: v_and_b32_e32 v30, 1, v30 +; GFX9-NEXT: v_and_b32_e32 v28, 1, v28 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v30 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v28 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:60 +; GFX9-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v18 +; GFX9-NEXT: v_and_b32_e32 v20, 1, v20 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v20 +; GFX9-NEXT: v_and_b32_e32 v25, 1, v25 +; GFX9-NEXT: v_and_b32_e32 v23, 1, v23 +; GFX9-NEXT: v_and_b32_e32 v21, 1, v21 +; GFX9-NEXT: v_and_b32_e32 v19, 1, v19 +; GFX9-NEXT: v_and_b32_e32 v17, 1, v17 +; GFX9-NEXT: v_and_b32_e32 v16, 1, v16 +; GFX9-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX9-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX9-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX9-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX9-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX9-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX9-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX9-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_writelane_b32 v33, s35, 3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v30 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_and_b32_e32 v18, 1, v26 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 +; GFX9-NEXT: v_and_b32_e32 v26, 1, v27 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_cndmask_b32_e64 v18, v24, v22, s[8:9] +; GFX9-NEXT: v_cndmask_b32_sdwa v20, v24, v22, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v22, 1, v29 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v22 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cndmask_b32_e64 v22, v28, v30, s[6:7] +; GFX9-NEXT: v_cndmask_b32_sdwa v24, v28, v30, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e64 v26, v31, v28, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v27, v31, v28, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v25 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e64 v25, v28, v29, s[10:11] +; GFX9-NEXT: v_cndmask_b32_sdwa v28, v28, v29, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: 
buffer_load_dword v29, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v23 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e64 v23, v30, v29, s[12:13] +; GFX9-NEXT: v_cndmask_b32_sdwa v29, v30, v29, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v21 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e64 v21, v30, v31, s[14:15] +; GFX9-NEXT: v_cndmask_b32_sdwa v30, v30, v31, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e64 v19, v31, v32, s[16:17] +; GFX9-NEXT: v_cndmask_b32_sdwa v31, v31, v32, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e64 v16, v17, v32, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v17, v17, v32, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e64 v14, v32, v15, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v15, v32, v15, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e64 v12, v13, v32, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v13, v13, v32, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e64 v10, v11, v32, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v11, v11, v32, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], 
s32 offset:104 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v29, v31, v32, s[34:35] -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[30:31] -; GFX9-NEXT: v_cndmask_b32_e64 v32, v28, v30, s[94:95] -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, v30, s[92:93] -; GFX9-NEXT: v_cndmask_b32_e64 v30, v26, v27, s[90:91] -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GFX9-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[88:89] -; GFX9-NEXT: v_cndmask_b32_e64 v27, v24, v25, s[78:79] -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX9-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[76:77] -; GFX9-NEXT: v_cndmask_b32_e64 v25, v22, v23, s[74:75] -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GFX9-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[72:73] -; GFX9-NEXT: v_cndmask_b32_e64 v23, v20, v21, s[62:63] -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GFX9-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[60:61] -; GFX9-NEXT: v_cndmask_b32_e64 v21, v18, v19, s[58:59] -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX9-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[56:57] -; GFX9-NEXT: v_cndmask_b32_e64 v19, v16, v17, s[46:47] -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[44:45] -; GFX9-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[42:43] -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[40:41] -; GFX9-NEXT: v_cndmask_b32_e64 v15, v12, v13, s[28:29] -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v10, v11, s[24:25] -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v8, v9, s[20:21] -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[16:17] -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v4, v5, s[12:13] -; 
GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[8:9] -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v32, v8, v9, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v8, v8, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v6 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v6, v6, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v4 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e64 v7, v4, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5] -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v0, v0, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: v_perm_b32 v1, v2, v5, s4 ; GFX9-NEXT: v_perm_b32 v2, v4, v7, s4 ; GFX9-NEXT: v_perm_b32 v3, v6, v9, s4 -; GFX9-NEXT: v_perm_b32 v4, v8, v11, s4 -; GFX9-NEXT: v_perm_b32 v5, v10, v13, s4 -; GFX9-NEXT: v_perm_b32 v6, v12, v15, s4 -; GFX9-NEXT: v_perm_b32 v7, v14, v17, s4 -; GFX9-NEXT: v_perm_b32 v8, v16, v19, s4 -; GFX9-NEXT: v_perm_b32 v9, v18, v21, s4 -; GFX9-NEXT: v_perm_b32 v10, v20, v23, s4 -; GFX9-NEXT: v_perm_b32 v11, v22, v25, s4 -; GFX9-NEXT: v_perm_b32 v12, v24, v27, s4 -; GFX9-NEXT: v_perm_b32 v13, v26, v30, s4 -; GFX9-NEXT: v_perm_b32 v14, v28, v32, s4 -; GFX9-NEXT: v_perm_b32 v15, v31, v29, s4 -; GFX9-NEXT: v_readlane_b32 s35, v33, 3 -; GFX9-NEXT: v_readlane_b32 s34, v33, 2 -; GFX9-NEXT: v_readlane_b32 s31, v33, 1 -; GFX9-NEXT: v_readlane_b32 s30, v33, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v4, v8, v32, s4 +; GFX9-NEXT: v_perm_b32 v5, v11, v10, s4 +; GFX9-NEXT: v_perm_b32 v6, v13, v12, s4 +; GFX9-NEXT: v_perm_b32 v7, v15, v14, s4 +; GFX9-NEXT: v_perm_b32 v8, v17, v16, 
s4 +; GFX9-NEXT: v_perm_b32 v9, v31, v19, s4 +; GFX9-NEXT: v_perm_b32 v10, v30, v21, s4 +; GFX9-NEXT: v_perm_b32 v11, v29, v23, s4 +; GFX9-NEXT: v_perm_b32 v12, v28, v25, s4 +; GFX9-NEXT: v_perm_b32 v13, v27, v26, s4 +; GFX9-NEXT: v_perm_b32 v14, v24, v22, s4 +; GFX9-NEXT: v_perm_b32 v15, v20, v18, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v32bf16: diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 4fe11760e71fd..3cea1d17a2bfa 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -656,10 +656,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v1 -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 -; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 32 +; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 @@ -766,10 +766,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 -; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 32 +; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 9fcfbba6fb235..0feefdf145639 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -629,9 +629,9 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 -; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 32 +; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 @@ -731,9 +731,9 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 -; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 32 +; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 @@ -1509,9 +1509,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_or_b32_e32 v3, 0x100, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3 -; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 @@ -1613,9 +1612,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 -; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll index ce7281702c108..1429d8e96ac7c 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll @@ -2778,8 +2778,7 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-NEXT: v_mov_b32_e32 v3, 0x3f80 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x4000 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -2913,8 +2912,7 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-NEXT: v_mov_b32_e32 v3, 0x3f80 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x3f00 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -3054,14 +3052,12 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a ; GFX9-NEXT: v_mov_b32_e32 v5, 0x3f80 ; GFX9-NEXT: v_mov_b32_e32 v6, 0x4000 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v2, v5, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v5, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 ; 
GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 @@ -3264,14 +3260,12 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a ; GFX9-NEXT: v_mov_b32_e32 v5, 0x3f80 ; GFX9-NEXT: v_mov_b32_e32 v6, 0x3f00 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v2, v5, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v5, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 @@ -3469,8 +3463,7 @@ define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-NEXT: v_mov_b32_e32 v3, 0x4100 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x4000 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -3606,8 +3599,7 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-NEXT: v_mov_b32_e32 v3, 0x4040 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffc100 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -3742,8 +3734,7 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffffc080 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x4100 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -3877,8 +3868,8 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff8000 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -4007,8 +3998,7 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffffc200 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffc180 ; GFX9-NEXT: 
v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -4144,8 +4134,7 @@ define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffffdb80 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffe000 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -4281,8 +4270,7 @@ define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %b ; GFX9-NEXT: v_mov_b32_e32 v3, 0x4c00 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x3480 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index 8aab9ec885f3c..5488be398fe48 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -605,34 +605,34 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; VI-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; VI-NEXT: v_mov_b32_e32 v6, s1 -; VI-NEXT: v_add_u32_e32 v5, vcc, s0, v0 ; VI-NEXT: s_cmp_eq_u32 s4, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; VI-NEXT: s_cmp_eq_u32 s4, 2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; VI-NEXT: v_cndmask_b32_sdwa v0, v1, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 3 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; VI-NEXT: s_cmp_eq_u32 s4, 4 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 5 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 6 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 7 -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; VI-NEXT: 
s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: flat_store_short v[5:6], v0 ; VI-NEXT: s_endpgm ; @@ -832,63 +832,63 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; VI-NEXT: flat_load_dwordx4 v[5:8], v[5:6] ; VI-NEXT: v_mov_b32_e32 v10, s1 -; VI-NEXT: v_add_u32_e32 v9, vcc, s0, v0 ; VI-NEXT: s_cmp_eq_u32 s4, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; VI-NEXT: s_cmp_eq_u32 s4, 2 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; VI-NEXT: v_cndmask_b32_sdwa v0, v1, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 3 -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; VI-NEXT: s_cmp_eq_u32 s4, 4 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 5 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; VI-NEXT: s_cmp_eq_u32 s4, 6 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 7 -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; VI-NEXT: s_cmp_eq_u32 s4, 8 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 9 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; VI-NEXT: s_cmp_eq_u32 s4, 10 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 11 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; VI-NEXT: s_cmp_eq_u32 s4, 12 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v6, vcc 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 13 -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 14 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 15 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1] +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v8, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: flat_store_short v[9:10], v0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll index db6ec2b32ad63..12b26cb6d8c19 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -1,18 +1,24 @@ -; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s -; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5 +; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s +; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89 %s -; GCN-LABEL: {{^}}extract_vector_elt_v2i16: -; GCN: s_load_dword [[VEC:s[0-9]+]] -; SIVI: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16 -; SIVI-DAG: v_mov_b32_e32 [[VELT0:v[0-9]+]], [[VEC]] -; SIVI-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] -; SIVI-DAG: buffer_store_short [[VELT0]] -; SIVI-DAG: buffer_store_short [[VELT1]] -; GFX9: v_mov_b32_e32 [[VVEC:v[0-9]+]], [[VEC]] -; GFX9: global_store_short_d16_hi v{{[0-9]+}}, [[VVEC]], -; GFX9: buffer_store_short [[VVEC]], define amdgpu_kernel void @extract_vector_elt_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { +; +; GCN-LABEL: extract_vector_elt_v2i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s4, s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s5, s4, 16 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_short v1, off, s[0:3], 0 +; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:20 +; GCN-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) 
%vec.ptr %p0 = extractelement <2 x i16> %vec, i32 0 %p1 = extractelement <2 x i16> %vec, i32 1 @@ -22,33 +28,65 @@ define amdgpu_kernel void @extract_vector_elt_v2i16(ptr addrspace(1) %out, ptr a ret void } -; GCN-LABEL: {{^}}extract_vector_elt_v2i16_dynamic_sgpr: -; GCN: s_load_dword [[IDX:s[0-9]+]] -; GCN: s_load_dword [[VEC:s[0-9]+]] -; GCN: s_lshl_b32 [[IDX_SCALED:s[0-9]+]], [[IDX]], 4 -; GCN: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], [[IDX_SCALED]] -; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] -; GCN: buffer_store_short [[VELT1]] -; GCN: ScratchSize: 0 define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i32 %idx) #0 { +; GCN-LABEL: extract_vector_elt_v2i16_dynamic_sgpr: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s4, s[4:5], 0x15 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s2, s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_lshl_b32 s4, s4, 4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s4, s2, s4 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; GFX89-LABEL: extract_vector_elt_v2i16_dynamic_sgpr: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_load_dword s8, s[4:5], 0x54 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_lshl_b32 s0, s8, 4 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_lshr_b32 s0, s2, s0 +; GFX89-NEXT: v_mov_b32_e32 v0, s0 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr %elt = extractelement <2 x i16> %vec, i32 %idx store i16 %elt, ptr addrspace(1) %out, align 2 ret void } -; GCN-LABEL: {{^}}extract_vector_elt_v2i16_dynamic_vgpr: -; GCN-DAG: {{flat|buffer|global}}_load_dword [[IDX:v[0-9]+]] -; GCN-DAG: v_lshlrev_b32_e32 [[IDX_SCALED:v[0-9]+]], 4, [[IDX]] -; GCN-DAG: s_load_dword [[VEC:s[0-9]+]] - -; SI: v_lshr_b32_e32 [[ELT:v[0-9]+]], [[VEC]], [[IDX_SCALED]] -; VI: v_lshrrev_b32_e64 [[ELT:v[0-9]+]], [[IDX_SCALED]], [[VEC]] - -; SI: buffer_store_short [[ELT]] -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]] -; GCN: ScratchSize: 0{{$}} define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, ptr addrspace(1) %idx.ptr) #0 { +; +; GCN-LABEL: extract_vector_elt_v2i16_dynamic_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v3, v[1:2], s[4:7], 0 addr64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_load_dword s2, s[2:3], 0x0 +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v3 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_lshr_b32_e32 v0, s2, v0 +; GCN-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, ptr addrspace(1) %idx.ptr, i64 %tid.ext @@ -60,14 +98,39 @@ define amdgpu_kernel void 
@extract_vector_elt_v2i16_dynamic_vgpr(ptr addrspace(1 ret void } -; GCN-LABEL: {{^}}extract_vector_elt_v3i16: -; GCN: s_load_dwordx4 - -; GCN-NOT: {{buffer|flat|global}}_load - -; GCN: buffer_store_short -; GCN: buffer_store_short define amdgpu_kernel void @extract_vector_elt_v3i16(ptr addrspace(1) %out, <3 x i16> %foo) #0 { +; +; +; +; +; GCN-LABEL: extract_vector_elt_v3i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:2 +; GCN-NEXT: s_endpgm +; +; GFX89-LABEL: extract_vector_elt_v3i16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: v_mov_b32_e32 v0, s3 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: v_mov_b32_e32 v0, s2 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:2 +; GFX89-NEXT: s_endpgm %p0 = extractelement <3 x i16> %foo, i32 0 %p1 = extractelement <3 x i16> %foo, i32 2 %out1 = getelementptr i16, ptr addrspace(1) %out, i32 1 @@ -76,17 +139,40 @@ define amdgpu_kernel void @extract_vector_elt_v3i16(ptr addrspace(1) %out, <3 x ret void } -; GCN-LABEL: {{^}}extract_vector_elt_v4i16: -; SI: s_load_dwordx4 -; SI: buffer_store_short -; SI: buffer_store_short - -; GFX89-DAG: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[4:5], 0x24 -; GFX89-DAG: v_mov_b32_e32 [[VLOAD0:v[0-9]+]], s[[#LOAD + 2]] -; GFX89-DAG: buffer_store_short [[VLOAD0]], off -; GFX89-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], s[[#LOAD + 3]] -; GFX89-DAG: buffer_store_short [[VLOAD1]], off define amdgpu_kernel void @extract_vector_elt_v4i16(ptr addrspace(1) %out, <4 x i16> %foo) #0 { +; +; +; GCN-LABEL: extract_vector_elt_v4i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:20 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm +; +; GFX89-LABEL: extract_vector_elt_v4i16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: v_mov_b32_e32 v0, s3 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_mov_b32_e32 v0, s2 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:20 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_endpgm %p0 = extractelement <4 x i16> %foo, i32 0 %p1 = extractelement <4 x i16> %foo, i32 2 %out1 = getelementptr i16, ptr addrspace(1) %out, i32 10 @@ -95,36 +181,64 @@ define amdgpu_kernel void @extract_vector_elt_v4i16(ptr addrspace(1) %out, <4 x ret void } -; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16: -; SI: s_load_dword s -; SI: s_load_dwordx2 s 
-; SI: s_load_dwordx2 s - -; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[4:5], 0x24 -; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[4:5], 0x4c -; GFX89-DAG: s_load_dword s{{[0-9]+}}, s[4:5], 0x54 - -; GCN-NOT: {{buffer|flat|global}} - -; SICI: buffer_store_short -; SICI: buffer_store_short -; SICI: buffer_store_short - -; GFX9-NOT: s_pack_ll_b32_b16 -; GFX9-NOT: s_pack_lh_b32_b16 - -; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4 -; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s -; GCN: {{buffer|global}}_store_short define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(ptr addrspace(1) %out, [8 x i32], <3 x i16> %foo, i32 %idx) #0 { +; +; +; GCN-LABEL: dynamic_extract_vector_elt_v3i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x15 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 s2, s2, 4 +; GCN-NEXT: s_lshr_b64 s[4:5], s[6:7], s2 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; GFX89-LABEL: dynamic_extract_vector_elt_v3i16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_load_dword s8, s[4:5], 0x54 +; GFX89-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_lshl_b32 s4, s8, 4 +; GFX89-NEXT: s_lshr_b64 s[4:5], s[6:7], s4 +; GFX89-NEXT: v_mov_b32_e32 v0, s4 +; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX89-NEXT: s_endpgm %p0 = extractelement <3 x i16> %foo, i32 %idx %out1 = getelementptr i16, ptr addrspace(1) %out, i32 1 store i16 %p0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}v_insertelement_v4i16_dynamic_sgpr: define amdgpu_kernel void @v_insertelement_v4i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) #0 { +; +; +; +; GCN-LABEL: v_insertelement_v4i16_dynamic_sgpr: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s8, s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GCN-NEXT: s_lshl_b32 s4, s8, 4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshr_b64 v[3:4], v[3:4], s4 +; GCN-NEXT: buffer_store_short v3, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -135,13 +249,44 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_sgpr(ptr addrspace(1) % ret void } -; GCN-LABEL: {{^}}reduce_load_vector_v8i16_extract_01: -; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]], -; GCN-NOT: {{s|buffer|flat|global}}_load_ -; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0 -; GCN-NOT: {{s|buffer|flat|global}}_load_ -; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 define amdgpu_kernel void @reduce_load_vector_v8i16_extract_01(ptr addrspace(4) %ptr) #0 { +; +; +; +; +; GCN-LABEL: reduce_load_vector_v8i16_extract_01: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: 
s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s1, s0, 16 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm +; +; GFX89-LABEL: reduce_load_vector_v8i16_extract_01: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_lshr_b32 s1, s0, 16 +; GFX89-NEXT: v_mov_b32_e32 v0, s0 +; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_mov_b32_e32 v0, s1 +; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(4) %ptr %elt0 = extractelement <16 x i16> %load, i32 0 %elt1 = extractelement <16 x i16> %load, i32 1 @@ -150,13 +295,44 @@ define amdgpu_kernel void @reduce_load_vector_v8i16_extract_01(ptr addrspace(4) ret void } -; GCN-LABEL: {{^}}reduce_load_vector_v8i16_extract_23: -; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]], -; GCN-NOT: {{s|buffer|flat|global}}_load_ -; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], {{0x1|0x4}} -; GCN-NOT: {{s|buffer|flat|global}}_load_ -; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 define amdgpu_kernel void @reduce_load_vector_v8i16_extract_23(ptr addrspace(4) %ptr) #0 { +; +; +; +; +; GCN-LABEL: reduce_load_vector_v8i16_extract_23: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s0, s[0:1], 0x1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s1, s0, 16 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm +; +; GFX89-LABEL: reduce_load_vector_v8i16_extract_23: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_load_dword s0, s[0:1], 0x4 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_lshr_b32 s1, s0, 16 +; GFX89-NEXT: v_mov_b32_e32 v0, s0 +; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_mov_b32_e32 v0, s1 +; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(4) %ptr %elt2 = extractelement <16 x i16> %load, i32 2 %elt3 = extractelement <16 x i16> %load, i32 3 @@ -165,14 +341,26 @@ define amdgpu_kernel void @reduce_load_vector_v8i16_extract_23(ptr addrspace(4) ret void } -; GCN-LABEL: {{^}}v_extractelement_v8i16_2: -; SI: buffer_load_dword [[RES:v[0-9]+]], v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 addr64 offset:4 -; SI: buffer_store_short [[RES]] -; VI: flat_load_dword [[RES:v[0-9]+]] -; VI: flat_store_short v[{{[0-9:]+}}], [[RES]] -; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:4 -; GFX9: 
global_store_short v{{[0-9]+}}, [[RES]] define amdgpu_kernel void @v_extractelement_v8i16_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; +; +; +; GCN-LABEL: v_extractelement_v8i16_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-NEXT: buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:4 +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -183,14 +371,26 @@ define amdgpu_kernel void @v_extractelement_v8i16_2(ptr addrspace(1) %out, ptr a ret void } -; GCN-LABEL: {{^}}v_extractelement_v8i16_6: -; SI: buffer_load_dword [[RES:v[0-9]+]], v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 addr64 offset:12 -; SI: buffer_store_short [[RES]] -; VI: flat_load_dword [[RES:v[0-9]+]] -; VI: flat_store_short v[{{[0-9:]+}}], [[RES]] -; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:12 -; GFX9: global_store_short v{{[0-9]+}}, [[RES]] define amdgpu_kernel void @v_extractelement_v8i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; +; +; +; GCN-LABEL: v_extractelement_v8i16_6: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-NEXT: buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:12 +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -201,9 +401,52 @@ define amdgpu_kernel void @v_extractelement_v8i16_6(ptr addrspace(1) %out, ptr a ret void } -; GCN-LABEL: {{^}}v_extractelement_v8i16_dynamic_sgpr: -; GCN-COUNT-7: v_cndmask_b32_e32 define amdgpu_kernel void @v_extractelement_v8i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 { +; +; +; +; GCN-LABEL: v_extractelement_v8i16_dynamic_sgpr: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s8, s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 1, v0 +; GCN-NEXT: v_mov_b32_e32 v7, v5 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_cmp_eq_u32 s8, 1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 2 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; 
GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 4 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 5 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 6 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 7 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GCN-NEXT: buffer_store_short v0, v[6:7], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -214,14 +457,26 @@ define amdgpu_kernel void @v_extractelement_v8i16_dynamic_sgpr(ptr addrspace(1) ret void } -; GCN-LABEL: {{^}}v_extractelement_v16i16_2: -; SI: buffer_load_dword [[RES:v[0-9]+]], v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 addr64 offset:4 -; SI: buffer_store_short [[RES]] -; VI: flat_load_dword [[RES:v[0-9]+]] -; VI: flat_store_short v[{{[0-9:]+}}], [[RES]] -; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:4 -; GFX9: global_store_short v{{[0-9]+}}, [[RES]] define amdgpu_kernel void @v_extractelement_v16i16_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; +; +; +; GCN-LABEL: v_extractelement_v16i16_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 5, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-NEXT: buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:4 +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -232,14 +487,26 @@ define amdgpu_kernel void @v_extractelement_v16i16_2(ptr addrspace(1) %out, ptr ret void } -; GCN-LABEL: {{^}}v_extractelement_v16i16_6: -; SI: buffer_load_dword [[RES:v[0-9]+]], v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 addr64 offset:12 -; SI: buffer_store_short [[RES]] -; VI: flat_load_dword [[RES:v[0-9]+]] -; VI: flat_store_short v[{{[0-9:]+}}], [[RES]] -; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:12 -; GFX9: global_store_short v{{[0-9]+}}, [[RES]] define amdgpu_kernel void @v_extractelement_v16i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; +; +; +; GCN-LABEL: v_extractelement_v16i16_6: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 5, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-NEXT: buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:12 +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 
@llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -250,9 +517,82 @@ define amdgpu_kernel void @v_extractelement_v16i16_6(ptr addrspace(1) %out, ptr ret void } -; GCN-LABEL: {{^}}v_extractelement_v16i16_dynamic_sgpr: -; GCN-COUNT-15: v_cndmask_b32_e32 define amdgpu_kernel void @v_extractelement_v16i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 { +; +; +; +; GCN-LABEL: v_extractelement_v16i16_dynamic_sgpr: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s8, s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 5, v0 +; GCN-NEXT: v_mov_b32_e32 v6, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[5:6], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 1, v0 +; GCN-NEXT: v_mov_b32_e32 v10, v6 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: buffer_load_dwordx4 v[5:8], v[5:6], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: s_cmp_eq_u32 s8, 1 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 2 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 4 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 5 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 6 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 7 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 8 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 9 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 10 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 11 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 12 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 13 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 14 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 15 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc +; GCN-NEXT: buffer_store_short v0, v[9:10], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %in, i64 %tid.ext diff --git 
a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll index 09279f6f0768c..b4983bbdf4afa 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll @@ -113,12 +113,9 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 { ; VI-SAFE-LABEL: test_fmax_legacy_ugt_v2f16: ; VI-SAFE: ; %bb.0: ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v3, v2 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-SAFE-NEXT: v_cmp_nle_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v2, v1, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] @@ -220,16 +217,13 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 { ; VI-SAFE-LABEL: test_fmax_legacy_ugt_v3f16: ; VI-SAFE: ; %bb.0: ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; VI-SAFE-NEXT: v_cmp_nle_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v4, v2, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-NNAN-LABEL: test_fmax_legacy_ugt_v3f16: @@ -350,22 +344,16 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 { ; VI-SAFE-LABEL: test_fmax_legacy_ugt_v4f16: ; VI-SAFE: ; %bb.0: ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v7, v6 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; VI-SAFE-NEXT: v_cmp_nle_f16 vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v4, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_cmp_nle_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v5, v2, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, 
v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-NNAN-LABEL: test_fmax_legacy_ugt_v4f16: @@ -523,22 +511,14 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; VI-SAFE-LABEL: test_fmax_legacy_ugt_v8f16: ; VI-SAFE: ; %bb.0: ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v15, v14 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v13, v12 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v11, v10 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v9, v8 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; VI-SAFE-NEXT: v_cmp_nle_f16 vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v8, v7, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_cmp_nle_f16 vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v9, v6, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_cmp_nle_f16 vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v10, v5, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_cmp_nle_f16 vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v11, v4, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v3, v7 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v2, v6 @@ -547,14 +527,10 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v4 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; VI-SAFE-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v14 -; VI-SAFE-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
VI-SAFE-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-NNAN-LABEL: test_fmax_legacy_ugt_v8f16: diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index 1b8a79ee982d1..753b74ba696d1 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -2083,19 +2083,17 @@ define <2 x half> @v_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v2, v1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v2, v5 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2131,19 +2129,17 @@ define <2 x half> @v_fmaximum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2179,24 +2175,22 @@ define <2 x half> @v_fmaximum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 ; GFX942-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 ; GFX942-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1 ; GFX942-NEXT: v_pk_max_f16 v3, v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v6, 0x7e00 -; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v6, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, 
v6, v3, vcc -; GFX942-NEXT: v_perm_b32 v1, v4, v0, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_perm_b32 v1, v6, v0, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, v5 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, |v2| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_sdwa v3, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX942-NEXT: v_perm_b32 v0, v3, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2238,19 +2232,17 @@ define <2 x half> @v_fmaximum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v5, -v2 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2284,25 +2276,23 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v2, v0, 2.0 op_sel_hi:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v4, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX942-NEXT: v_perm_b32 v2, v3, v0, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_perm_b32 v2, v4, v0, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v1 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_sdwa v4, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX942-NEXT: v_perm_b32 v0, v3, v0, s0 +; GFX942-NEXT: v_cndmask_b32_e32 
v0, v3, v2, vcc +; GFX942-NEXT: v_perm_b32 v0, v4, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v2f16__inlineimm1: @@ -2337,19 +2327,17 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) { ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v4, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, 4.0 op_sel_hi:[1,0] ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v2, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2387,18 +2375,16 @@ define <3 x half> @v_fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v5, v1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 @@ -2408,10 +2394,9 @@ define <3 x half> @v_fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2450,18 +2435,16 @@ define <3 x half> @v_fmaximum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: 
v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, v5 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 @@ -2471,10 +2454,9 @@ define <3 x half> @v_fmaximum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2519,36 +2501,33 @@ define <3 x half> @v_fmaximum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3 ; GFX942-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v0 ; GFX942-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v2 ; GFX942-NEXT: v_pk_max_f16 v7, v7, v9 -; GFX942-NEXT: v_mov_b32_e32 v12, 0x7e00 -; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX942-NEXT: v_mov_b32_e32 v9, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v6, v6, v8 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX942-NEXT: v_cndmask_b32_sdwa v12, v9, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4 ; GFX942-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5 -; GFX942-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v8, v9, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v9, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v9, v6, vcc ; GFX942-NEXT: v_perm_b32 v2, v8, v0, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v11 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX942-NEXT: v_perm_b32 v6, v9, v1, s0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GFX942-NEXT: v_perm_b32 v6, v12, v1, s0 ; GFX942-NEXT: v_pk_max_f16 v6, v6, v10 +; GFX942-NEXT: v_cndmask_b32_sdwa v3, v9, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc ; GFX942-NEXT: v_perm_b32 v0, v3, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2596,18 +2575,16 @@ define <3 x half> 
@v_fmaximum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 @@ -2617,10 +2594,9 @@ define <3 x half> @v_fmaximum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2657,34 +2633,31 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v6, 0x7e00 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v7, v1, 2.0 ; GFX942-NEXT: s_mov_b32 s1, 0x5040100 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 ; GFX942-NEXT: s_movk_i32 s0, 0x7e00 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX942-NEXT: v_perm_b32 v4, v5, v0, s1 -; GFX942-NEXT: v_pk_max_f16 v4, v4, v2 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc ; GFX942-NEXT: v_pack_b32_f16 v7, v1, s0 ; GFX942-NEXT: v_pk_max_f16 v7, v7, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_perm_b32 v4, v6, v0, s1 +; GFX942-NEXT: v_pk_max_f16 v4, v4, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 
v1, v6, v7, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX942-NEXT: v_perm_b32 v0, v5, v0, s1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_perm_b32 v0, v6, v0, s1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v3f16__inlineimm1: @@ -2722,18 +2695,16 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) { ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v1, v4, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, 4.0 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 @@ -2743,10 +2714,9 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) { ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2785,37 +2755,33 @@ define <4 x half> @v_fmaximum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v5, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v4, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 ; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2854,37 +2820,33 @@ define <4 x half> @v_fmaximum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v5 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 ; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2929,42 +2891,37 @@ define <4 x half> @v_fmaximum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 ; GFX942-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1 ; GFX942-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3 ; GFX942-NEXT: v_pk_max_f16 v7, v7, v9 -; GFX942-NEXT: v_mov_b32_e32 v12, 0x7e00 -; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX942-NEXT: v_mov_b32_e32 v9, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v6, v6, v8 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX942-NEXT: v_cndmask_b32_sdwa v12, v9, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; 
GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5 ; GFX942-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4 -; GFX942-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v8, v9, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v9, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc ; GFX942-NEXT: v_perm_b32 v2, v8, v1, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v11 -; GFX942-NEXT: v_perm_b32 v6, v9, v0, s0 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_perm_b32 v6, v12, v0, s0 ; GFX942-NEXT: v_pk_max_f16 v6, v6, v10 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_sdwa v3, v9, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v12, |v4| src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v7, v9, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| ; GFX942-NEXT: v_perm_b32 v1, v3, v1, s0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v9, v6, vcc ; GFX942-NEXT: v_perm_b32 v0, v7, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -3012,37 +2969,33 @@ define <4 x half> @v_fmaximum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 ; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -3079,41 +3032,37 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v6, 0x7e00 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v7, v1, 2.0 op_sel_hi:[1,0] ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v8, v5, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX942-NEXT: v_perm_b32 v4, v8, v1, s0 ; GFX942-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v8, v5, v0, s0 -; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX942-NEXT: v_perm_b32 v8, v6, v0, s0 ; GFX942-NEXT: v_pk_max_f16 v8, v8, v2 -; GFX942-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_sdwa v7, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v8, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX942-NEXT: v_perm_b32 v1, v7, v1, s0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc -; GFX942-NEXT: v_perm_b32 v0, v5, v0, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc +; GFX942-NEXT: v_perm_b32 v0, v6, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v4f16__inlineimm1: @@ -3151,37 +3100,33 @@ define <4 x half> 
@v_fmaximum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) { ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v1, v4, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v0, v6, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 ; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v4, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -4005,19 +3950,17 @@ define <4 x half> @v_no_fmaximum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b, ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v1, v5, s0 ; GFX942-NEXT: v_pk_max_f16 v3, v0, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v1, v5, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll index 049c6799da079..1be0590bcaae0 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll @@ -114,12 +114,9 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 { ; VI-SAFE-LABEL: 
test_fmin_legacy_ule_v2f16: ; VI-SAFE: ; %bb.0: ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-SAFE-NEXT: v_cmp_ngt_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v2, v1, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] @@ -221,16 +218,13 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 { ; VI-SAFE-LABEL: test_fmin_legacy_ule_v3f16: ; VI-SAFE: ; %bb.0: ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; VI-SAFE-NEXT: v_cmp_ngt_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v4, v2, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-NNAN-LABEL: test_fmin_legacy_ule_v3f16: @@ -351,22 +345,16 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 { ; VI-SAFE-LABEL: test_fmin_legacy_ule_v4f16: ; VI-SAFE: ; %bb.0: ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v7, v6 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; VI-SAFE-NEXT: v_cmp_ngt_f16 vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v4, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_cmp_ngt_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v5, v2, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: 
v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-NNAN-LABEL: test_fmin_legacy_ule_v4f16: @@ -524,22 +512,14 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; VI-SAFE-LABEL: test_fmin_legacy_ule_v8f16: ; VI-SAFE: ; %bb.0: ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v15, v14 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v13, v12 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v11, v10 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v9, v8 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; VI-SAFE-NEXT: v_cmp_ngt_f16 vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v8, v7, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_cmp_ngt_f16 vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v9, v6, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_cmp_ngt_f16 vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v10, v5, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_cmp_ngt_f16 vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v11, v4, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v7 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v2, v6 @@ -548,14 +528,10 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v4 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; VI-SAFE-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v14 -; VI-SAFE-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-NNAN-LABEL: test_fmin_legacy_ule_v8f16: diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll 
b/llvm/test/CodeGen/AMDGPU/fminimum3.ll index 96e9aa375f5ee..ce6b522adff0a 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -2083,19 +2083,17 @@ define <2 x half> @v_fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v2, v1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v2, v5 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2131,19 +2129,17 @@ define <2 x half> @v_fminimum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v1, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2179,24 +2175,22 @@ define <2 x half> @v_fminimum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 ; GFX942-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 ; GFX942-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1 ; GFX942-NEXT: v_pk_min_f16 v3, v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v6, 0x7e00 -; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v6, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc -; GFX942-NEXT: v_perm_b32 v1, v4, v0, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_perm_b32 v1, v6, v0, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v1, v5 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: 
v_cndmask_b32_e32 v3, v6, v3, vcc +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, |v2| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_sdwa v3, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX942-NEXT: v_perm_b32 v0, v3, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2238,19 +2232,17 @@ define <2 x half> @v_fminimum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v5, -v2 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2284,25 +2276,23 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_min_f16 v2, v0, 2.0 op_sel_hi:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v4, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX942-NEXT: v_perm_b32 v2, v3, v0, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_perm_b32 v2, v4, v0, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v1 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_sdwa v4, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX942-NEXT: v_perm_b32 v0, v3, v0, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_perm_b32 v0, v4, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v2f16__inlineimm1: @@ -2337,19 +2327,17 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) { ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v4, 
v3, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v4, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v1, 4.0 op_sel_hi:[1,0] ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v2, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2387,18 +2375,16 @@ define <3 x half> @v_fminimum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v5, v1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 @@ -2408,10 +2394,9 @@ define <3 x half> @v_fminimum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2450,18 +2435,16 @@ define <3 x half> @v_fminimum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v1, v5 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 @@ -2471,10 +2454,9 @@ define <3 x half> @v_fminimum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2519,36 +2501,33 @@ define <3 x half> @v_fminimum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3 ; GFX942-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v0 ; GFX942-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v2 ; GFX942-NEXT: v_pk_min_f16 v7, v7, v9 -; GFX942-NEXT: v_mov_b32_e32 v12, 0x7e00 -; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX942-NEXT: v_mov_b32_e32 v9, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v6, v6, v8 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX942-NEXT: v_cndmask_b32_sdwa v12, v9, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4 ; GFX942-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5 -; GFX942-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v8, v9, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v9, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v9, v6, vcc ; GFX942-NEXT: v_perm_b32 v2, v8, v0, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v11 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX942-NEXT: v_perm_b32 v6, v9, v1, s0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GFX942-NEXT: v_perm_b32 v6, v12, v1, s0 ; GFX942-NEXT: v_pk_min_f16 v6, v6, v10 +; GFX942-NEXT: v_cndmask_b32_sdwa v3, v9, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc ; GFX942-NEXT: v_perm_b32 v0, v3, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2596,18 +2575,16 @@ define <3 x half> @v_fminimum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] ; 
GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 @@ -2617,10 +2594,9 @@ define <3 x half> @v_fminimum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2657,34 +2633,31 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v6, 0x7e00 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v7, v1, 2.0 ; GFX942-NEXT: s_mov_b32 s1, 0x5040100 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 ; GFX942-NEXT: s_movk_i32 s0, 0x7e00 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX942-NEXT: v_perm_b32 v4, v5, v0, s1 -; GFX942-NEXT: v_pk_min_f16 v4, v4, v2 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc ; GFX942-NEXT: v_pack_b32_f16 v7, v1, s0 ; GFX942-NEXT: v_pk_min_f16 v7, v7, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_perm_b32 v4, v6, v0, s1 +; GFX942-NEXT: v_pk_min_f16 v4, v4, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX942-NEXT: v_perm_b32 v0, v5, v0, s1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_perm_b32 v0, v6, v0, s1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX950-LABEL: v_fminimum3_v3f16__inlineimm1: @@ -2722,18 +2695,16 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) { ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v1, v4, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v1, 4.0 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 @@ -2743,10 +2714,9 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) { ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2785,37 +2755,33 @@ define <4 x half> @v_fminimum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v5, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v4, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 ; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa 
vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2854,37 +2820,33 @@ define <4 x half> @v_fminimum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v5 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 ; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2929,42 +2891,37 @@ define <4 x half> @v_fminimum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 ; GFX942-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1 ; GFX942-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3 ; GFX942-NEXT: v_pk_min_f16 v7, v7, v9 -; GFX942-NEXT: v_mov_b32_e32 v12, 0x7e00 -; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX942-NEXT: v_mov_b32_e32 v9, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v6, v6, v8 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX942-NEXT: v_cndmask_b32_sdwa v12, v9, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5 ; GFX942-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4 -; GFX942-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v8, v9, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: 
v_cmp_o_f16_e64 vcc, |v0|, |v2| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v9, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc ; GFX942-NEXT: v_perm_b32 v2, v8, v1, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v11 -; GFX942-NEXT: v_perm_b32 v6, v9, v0, s0 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_perm_b32 v6, v12, v0, s0 ; GFX942-NEXT: v_pk_min_f16 v6, v6, v10 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_sdwa v3, v9, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v12, |v4| src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v7, v9, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| ; GFX942-NEXT: v_perm_b32 v1, v3, v1, s0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v9, v6, vcc ; GFX942-NEXT: v_perm_b32 v0, v7, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -3012,37 +2969,33 @@ define <4 x half> @v_fminimum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 ; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc -; GFX942-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -3079,41 +3032,37 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v6, 0x7e00 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v7, v1, 2.0 op_sel_hi:[1,0] ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v8, v5, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX942-NEXT: v_perm_b32 v4, v8, v1, s0 ; GFX942-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v8, v5, v0, s0 -; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX942-NEXT: v_perm_b32 v8, v6, v0, s0 ; GFX942-NEXT: v_pk_min_f16 v8, v8, v2 -; GFX942-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_sdwa v7, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v8, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX942-NEXT: v_perm_b32 v1, v7, v1, s0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc -; GFX942-NEXT: v_perm_b32 v0, v5, v0, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc +; GFX942-NEXT: v_perm_b32 v0, v6, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v4f16__inlineimm1: @@ -3151,37 +3100,33 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) { ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 -; 
GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v1, v4, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v0, v6, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 ; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v4, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -4005,19 +3950,17 @@ define <4 x half> @v_no_fminimum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b, ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v1, v5, s0 ; GFX942-NEXT: v_pk_min_f16 v3, v0, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v1, v5, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index ff894d184e6c4..1b508af610d74 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -208,10 +208,8 @@ define <2 x i16> @fneg_xor_select_v2i16(<2 x i1> %cond, <2 x i16> %arg0, <2 x i1 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 @@ -746,22 +744,19 @@ define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX9-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v4, v1, v0, s4 ; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 +; GFX9-NEXT: v_perm_b32 v4, v1, v0, s4 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX9-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 @@ -854,22 +849,19 @@ define <2 x i16> @select_fneg_xor_select_v2i16(<2 x i1> %cond0, <2 x i1> %cond1, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX9-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v4, v1, v0, s4 ; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 +; GFX9-NEXT: v_perm_b32 v4, v1, v0, s4 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX9-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index 4b9da7b49e997..e55178d0d8d30 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -336,52 +336,52 @@ define amdgpu_kernel void @half8_inselt(ptr addrspace(1) %out, <8 x half> %vec, ; GCN-LABEL: half8_inselt: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 -; GCN-NEXT: s_load_dword s6, s[4:5], 0x44 +; GCN-NEXT: s_load_dword s8, s[4:5], 0x44 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GCN-NEXT: 
v_mov_b32_e32 v0, 0x3c00 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s7, s3, 16 -; GCN-NEXT: s_cmp_lg_u32 s6, 7 -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s6, 6 -; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GCN-NEXT: s_lshr_b32 s6, s3, 16 +; GCN-NEXT: s_cmp_lg_u32 s8, 7 +; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[6:7] +; GCN-NEXT: s_cmp_lg_u32 s8, 6 +; GCN-NEXT: v_cndmask_b32_sdwa v1, v0, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_lshr_b32 s3, s2, 16 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: s_cmp_lg_u32 s8, 5 +; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; GCN-NEXT: s_cmp_lg_u32 s6, 5 +; GCN-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[6:7] ; GCN-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: s_cmp_lg_u32 s8, 4 +; GCN-NEXT: v_cndmask_b32_sdwa v1, v0, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s6, 4 -; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GCN-NEXT: s_lshr_b32 s6, s1, 16 +; GCN-NEXT: s_cmp_lg_u32 s8, 3 ; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s1, 16 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; GCN-NEXT: s_cmp_lg_u32 s6, 3 +; GCN-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] ; GCN-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s6, 2 -; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: s_cmp_lg_u32 s8, 2 +; GCN-NEXT: v_cndmask_b32_sdwa v1, v0, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v4, s1 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_lshr_b32 s1, s0, 16 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: s_cmp_lg_u32 s8, 1 +; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GCN-NEXT: s_cmp_lg_u32 s6, 1 +; GCN-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] ; GCN-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v4, s1 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: v_cndmask_b32_sdwa v4, v0, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v5, s0 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GCN-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v4, s4 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll index b51cb9df8d784..816f5c2727c6f 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll @@ -1102,7 +1102,7 @@ define amdgpu_kernel void 
@v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ; VI-LABEL: v_insertelement_v8bf16_dynamic: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 @@ -1112,139 +1112,135 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; VI-NEXT: s_cmp_eq_u32 s9, 6 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 -; VI-NEXT: s_cmp_eq_u32 s5, 6 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s9, 7 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 7 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s9, 4 +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s9, 5 +; VI-NEXT: v_mov_b32_e32 v6, s8 +; VI-NEXT: s_cselect_b64 s[10:11], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s9, 2 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s9, 3 +; VI-NEXT: s_cselect_b64 s[12:13], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s9, 0 +; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s9, 1 +; VI-NEXT: s_cselect_b64 s[8:9], -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 4 -; VI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 5 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 2 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 3 -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; VI-NEXT: v_cndmask_b32_e64 v7, v3, v6, s[0:1] +; VI-NEXT: v_cndmask_b32_sdwa v3, v3, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[10:11] +; VI-NEXT: v_cndmask_b32_e64 v8, v2, v6, s[2:3] +; VI-NEXT: v_cndmask_b32_sdwa v2, v2, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[12:13] +; VI-NEXT: v_cndmask_b32_e64 v9, v1, v6, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[8:9] ; VI-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 0 -; VI-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 1 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_or_b32_sdwa v1, 
v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e64 v7, v0, v6, s[6:7] +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX900-LABEL: v_insertelement_v8bf16_dynamic: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX900-NEXT: s_mov_b32 s16, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] -; GFX900-NEXT: s_cmp_eq_u32 s5, 6 -; GFX900-NEXT: v_mov_b32_e32 v5, s4 -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 7 -; GFX900-NEXT: s_mov_b32 s2, 0x5040100 +; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[14:15] +; GFX900-NEXT: s_cmp_eq_u32 s11, 6 +; GFX900-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s11, 7 +; GFX900-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s11, 4 +; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] +; GFX900-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s11, 5 +; GFX900-NEXT: v_mov_b32_e32 v5, s10 +; GFX900-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s11, 2 +; GFX900-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s11, 3 +; GFX900-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s11, 0 +; GFX900-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s11, 1 +; GFX900-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 4 -; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 5 -; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 2 -; GFX900-NEXT: v_perm_b32 v3, v3, v6, s2 -; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 3 -; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 0 -; GFX900-NEXT: v_perm_b32 v2, v6, v2, s2 -; GFX900-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 1 -; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX900-NEXT: v_perm_b32 v1, v6, v1, s2 -; GFX900-NEXT: v_perm_b32 v0, v5, v0, s2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v3, v5, s[0:1] +; GFX900-NEXT: 
v_cndmask_b32_sdwa v3, v3, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v2, v5, s[2:3] +; GFX900-NEXT: v_cndmask_b32_sdwa v2, v2, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e64 v8, v1, v5, s[4:5] +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v1, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[10:11] +; GFX900-NEXT: v_perm_b32 v3, v3, v6, s16 +; GFX900-NEXT: v_cndmask_b32_e64 v6, v0, v5, s[6:7] +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v0, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: v_perm_b32 v2, v2, v7, s16 +; GFX900-NEXT: v_perm_b32 v1, v1, v8, s16 +; GFX900-NEXT: v_perm_b32 v0, v0, v6, s16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] ; GFX900-NEXT: s_endpgm ; ; GFX942-LABEL: v_insertelement_v8bf16_dynamic: ; GFX942: ; %bb.0: -; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX942-NEXT: s_mov_b32 s16, 0x5040100 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] -; GFX942-NEXT: s_cmp_eq_u32 s7, 6 -; GFX942-NEXT: v_mov_b32_e32 v5, s6 -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 7 -; GFX942-NEXT: s_mov_b32 s2, 0x5040100 +; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[10:11] +; GFX942-NEXT: s_cmp_eq_u32 s13, 6 +; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s13, 7 +; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s13, 4 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] +; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s13, 5 +; GFX942-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s13, 2 +; GFX942-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s13, 3 +; GFX942-NEXT: v_mov_b32_e32 v5, s12 +; GFX942-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s13, 0 +; GFX942-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s13, 1 +; GFX942-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 4 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 5 -; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX942-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 2 -; GFX942-NEXT: v_perm_b32 v3, v3, v6, s2 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 3 -; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 0 -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s2 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 1 -; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; 
GFX942-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX942-NEXT: v_perm_b32 v1, v6, v1, s2 -; GFX942-NEXT: v_perm_b32 v0, v5, v0, s2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_cndmask_b32_e64 v6, v3, v5, s[0:1] +; GFX942-NEXT: v_cndmask_b32_sdwa v3, v3, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[10:11] +; GFX942-NEXT: v_cndmask_b32_e64 v7, v2, v5, s[2:3] +; GFX942-NEXT: v_cndmask_b32_e64 v8, v1, v5, s[4:5] +; GFX942-NEXT: v_cndmask_b32_sdwa v2, v2, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[14:15] +; GFX942-NEXT: v_cndmask_b32_e64 v9, v0, v5, s[6:7] +; GFX942-NEXT: v_perm_b32 v3, v3, v6, s16 +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v1, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[12:13] +; GFX942-NEXT: v_perm_b32 v2, v2, v7, s16 +; GFX942-NEXT: v_perm_b32 v1, v1, v8, s16 +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v0, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: v_perm_b32 v0, v0, v9, s16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] ; GFX942-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1459,263 +1455,255 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; ; VI-LABEL: v_insertelement_v16bf16_dynamic: ; VI: ; %bb.0: +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[16:17], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8 +; VI-NEXT: s_cmp_eq_u32 s17, 14 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 -; VI-NEXT: s_cmp_eq_u32 s7, 14 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc -; VI-NEXT: v_mov_b32_e32 v12, s6 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 15 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cndmask_b32_e32 v13, v3, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 12 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 13 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; VI-NEXT: v_cndmask_b32_e64 v2, v2, v12, s[0:1] ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 10 +; VI-NEXT: s_cmp_eq_u32 s17, 15 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 11 -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; VI-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] +; VI-NEXT: s_cmp_eq_u32 s17, 12 +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 8 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_cmp_eq_u32 s17, 13 
+; VI-NEXT: v_mov_b32_e32 v12, s16 +; VI-NEXT: s_cselect_b64 s[18:19], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s17, 10 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v15, v15, v12, s[2:3] -; VI-NEXT: s_cmp_eq_u32 s7, 9 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 6 -; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v15, v16, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 7 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[4:5] +; VI-NEXT: s_cmp_eq_u32 s17, 11 +; VI-NEXT: s_cselect_b64 s[20:21], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s17, 8 +; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s17, 9 +; VI-NEXT: s_cselect_b64 s[22:23], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s17, 6 +; VI-NEXT: s_cselect_b64 s[8:9], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s17, 7 +; VI-NEXT: s_cselect_b64 s[24:25], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s17, 4 +; VI-NEXT: s_cselect_b64 s[10:11], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s17, 5 +; VI-NEXT: s_cselect_b64 s[26:27], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s17, 2 +; VI-NEXT: s_cselect_b64 s[12:13], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s17, 3 +; VI-NEXT: s_cselect_b64 s[28:29], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s17, 0 +; VI-NEXT: s_cselect_b64 s[14:15], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s17, 1 +; VI-NEXT: s_cselect_b64 s[16:17], -1, 0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cndmask_b32_e64 v13, v3, v12, s[0:1] +; VI-NEXT: v_cndmask_b32_sdwa v3, v3, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[18:19] +; VI-NEXT: v_cndmask_b32_e64 v14, v2, v12, s[2:3] +; VI-NEXT: v_cndmask_b32_sdwa v2, v2, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[20:21] +; VI-NEXT: v_cndmask_b32_e64 v15, v1, v12, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[22:23] +; VI-NEXT: v_cndmask_b32_e64 v16, v0, v12, s[6:7] +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[24:25] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_cndmask_b32_e64 v14, v14, v12, s[0:1] -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 4 +; VI-NEXT: v_cndmask_b32_e64 v17, v7, v12, s[8:9] +; VI-NEXT: v_cndmask_b32_sdwa v7, v7, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[26:27] ; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v15, v17, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 5 -; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: 
s_cmp_eq_u32 s7, 2 -; VI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 3 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 0 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 1 -; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e64 v13, v6, v12, s[10:11] +; VI-NEXT: v_cndmask_b32_sdwa v6, v6, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[28:29] +; VI-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e64 v14, v5, v12, s[12:13] +; VI-NEXT: v_cndmask_b32_sdwa v5, v5, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[16:17] +; VI-NEXT: v_or_b32_sdwa v6, v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e64 v13, v4, v12, s[14:15] +; VI-NEXT: v_cndmask_b32_sdwa v4, v4, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; VI-NEXT: flat_store_dwordx4 v[10:11], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX900-LABEL: v_insertelement_v16bf16_dynamic: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 -; GFX900-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX900-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[20:21], s[8:9], 0x10 +; GFX900-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX900-NEXT: s_mov_b32 s33, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] -; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[2:3] offset:16 -; GFX900-NEXT: s_cmp_eq_u32 s5, 6 -; GFX900-NEXT: v_mov_b32_e32 v9, s4 -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 7 -; GFX900-NEXT: s_mov_b32 s2, 0x5040100 +; GFX900-NEXT: global_load_dwordx4 v[0:3], v8, s[18:19] +; GFX900-NEXT: global_load_dwordx4 v[4:7], v8, s[18:19] offset:16 +; GFX900-NEXT: s_cmp_eq_u32 s21, 6 +; GFX900-NEXT: 
s_cselect_b64 s[0:1], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s21, 7 +; GFX900-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s21, 4 +; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] +; GFX900-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s21, 5 +; GFX900-NEXT: v_mov_b32_e32 v9, s20 +; GFX900-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s21, 2 +; GFX900-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s21, 3 +; GFX900-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s21, 0 +; GFX900-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s21, 1 +; GFX900-NEXT: s_cselect_b64 s[24:25], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s21, 14 +; GFX900-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s21, 15 +; GFX900-NEXT: s_cselect_b64 s[26:27], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s21, 12 +; GFX900-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s21, 13 +; GFX900-NEXT: s_cselect_b64 s[28:29], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s21, 10 +; GFX900-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s21, 11 +; GFX900-NEXT: s_cselect_b64 s[30:31], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s21, 8 +; GFX900-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s21, 9 +; GFX900-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_cndmask_b32_e32 v10, v4, v9, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 4 -; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 5 -; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 2 -; GFX900-NEXT: v_perm_b32 v4, v4, v10, s2 -; GFX900-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 3 -; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 0 -; GFX900-NEXT: v_cndmask_b32_e32 v11, v12, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 1 -; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 14 -; GFX900-NEXT: v_cndmask_b32_e32 v12, v13, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 15 +; GFX900-NEXT: v_cndmask_b32_e64 v10, v3, v9, s[0:1] +; GFX900-NEXT: v_cndmask_b32_sdwa v3, v3, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[18:19] +; GFX900-NEXT: v_cndmask_b32_e64 v11, v2, v9, s[2:3] +; GFX900-NEXT: v_cndmask_b32_sdwa v2, v2, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[22:23] +; GFX900-NEXT: v_cndmask_b32_e64 v12, v1, v9, s[4:5] +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v1, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[24:25] +; GFX900-NEXT: v_cndmask_b32_e64 v13, v0, v9, s[6:7] +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v0, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[26:27] ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: 
v_lshrrev_b32_e32 v14, 16, v8 -; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 12 -; GFX900-NEXT: v_perm_b32 v1, v12, v1, s2 -; GFX900-NEXT: v_cndmask_b32_e32 v12, v14, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 13 -; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GFX900-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 10 -; GFX900-NEXT: v_perm_b32 v8, v12, v8, s2 -; GFX900-NEXT: v_cndmask_b32_e32 v12, v15, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 11 -; GFX900-NEXT: v_perm_b32 v3, v10, v3, s2 -; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 8 -; GFX900-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 9 -; GFX900-NEXT: v_perm_b32 v2, v11, v2, s2 -; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc -; GFX900-NEXT: v_perm_b32 v7, v12, v7, s2 -; GFX900-NEXT: v_perm_b32 v6, v10, v6, s2 -; GFX900-NEXT: v_perm_b32 v5, v9, v5, s2 -; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16 -; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] +; GFX900-NEXT: v_cndmask_b32_e64 v14, v7, v9, s[8:9] +; GFX900-NEXT: v_cndmask_b32_sdwa v7, v7, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[28:29] +; GFX900-NEXT: v_cndmask_b32_e64 v15, v6, v9, s[10:11] +; GFX900-NEXT: v_cndmask_b32_sdwa v6, v6, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[30:31] +; GFX900-NEXT: v_perm_b32 v3, v3, v10, s33 +; GFX900-NEXT: v_cndmask_b32_e64 v10, v5, v9, s[12:13] +; GFX900-NEXT: v_cndmask_b32_sdwa v5, v5, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[20:21] +; GFX900-NEXT: v_perm_b32 v2, v2, v11, s33 +; GFX900-NEXT: v_cndmask_b32_e64 v11, v4, v9, s[14:15] +; GFX900-NEXT: v_cndmask_b32_sdwa v4, v4, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: v_perm_b32 v7, v7, v14, s33 +; GFX900-NEXT: v_perm_b32 v6, v6, v15, s33 +; GFX900-NEXT: v_perm_b32 v5, v5, v10, s33 +; GFX900-NEXT: v_perm_b32 v4, v4, v11, s33 +; GFX900-NEXT: v_perm_b32 v1, v1, v12, s33 +; GFX900-NEXT: v_perm_b32 v0, v0, v13, s33 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_endpgm ; ; GFX942-LABEL: v_insertelement_v16bf16_dynamic: ; GFX942: ; %bb.0: -; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x10 ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX942-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX942-NEXT: s_mov_b32 s33, 0x5040100 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] -; GFX942-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 -; GFX942-NEXT: s_cmp_eq_u32 s7, 6 -; GFX942-NEXT: v_mov_b32_e32 v9, s6 -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 7 
-; GFX942-NEXT: s_mov_b32 s2, 0x5040100 +; GFX942-NEXT: global_load_dwordx4 v[0:3], v8, s[18:19] +; GFX942-NEXT: global_load_dwordx4 v[4:7], v8, s[18:19] offset:16 +; GFX942-NEXT: s_cmp_eq_u32 s21, 6 +; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s21, 7 +; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s21, 4 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] +; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s21, 5 +; GFX942-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s21, 2 +; GFX942-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s21, 3 +; GFX942-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s21, 0 +; GFX942-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s21, 1 +; GFX942-NEXT: s_cselect_b64 s[24:25], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s21, 14 +; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s21, 15 +; GFX942-NEXT: v_mov_b32_e32 v9, s20 +; GFX942-NEXT: s_cselect_b64 s[26:27], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s21, 12 +; GFX942-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s21, 13 +; GFX942-NEXT: s_cselect_b64 s[28:29], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s21, 10 +; GFX942-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s21, 11 +; GFX942-NEXT: s_cselect_b64 s[30:31], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s21, 8 +; GFX942-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s21, 9 +; GFX942-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GFX942-NEXT: s_waitcnt vmcnt(1) -; GFX942-NEXT: v_cndmask_b32_e32 v10, v3, v9, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 4 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 5 -; GFX942-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX942-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 2 -; GFX942-NEXT: v_perm_b32 v3, v3, v10, s2 -; GFX942-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 3 -; GFX942-NEXT: v_lshrrev_b32_e32 v12, 16, v1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 0 -; GFX942-NEXT: v_perm_b32 v2, v10, v2, s2 -; GFX942-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 1 -; GFX942-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 14 -; GFX942-NEXT: v_perm_b32 v1, v10, v1, s2 -; GFX942-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 15 +; GFX942-NEXT: v_cndmask_b32_e64 v10, v3, v9, s[0:1] +; GFX942-NEXT: v_cndmask_b32_sdwa v3, v3, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[18:19] +; GFX942-NEXT: v_cndmask_b32_e64 v11, v2, v9, s[2:3] +; GFX942-NEXT: v_cndmask_b32_e64 v12, v1, v9, s[4:5] +; GFX942-NEXT: v_cndmask_b32_sdwa v2, v2, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[22:23] +; GFX942-NEXT: v_cndmask_b32_e64 v13, v0, v9, s[6:7] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; 
GFX942-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 12 -; GFX942-NEXT: v_perm_b32 v0, v10, v0, s2 -; GFX942-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 13 -; GFX942-NEXT: v_lshrrev_b32_e32 v15, 16, v6 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 10 -; GFX942-NEXT: v_perm_b32 v7, v10, v7, s2 -; GFX942-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 11 -; GFX942-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 8 -; GFX942-NEXT: v_perm_b32 v6, v10, v6, s2 -; GFX942-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 9 -; GFX942-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc -; GFX942-NEXT: v_perm_b32 v5, v10, v5, s2 -; GFX942-NEXT: v_perm_b32 v4, v9, v4, s2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_cndmask_b32_e64 v14, v7, v9, s[8:9] +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v1, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[24:25] +; GFX942-NEXT: v_cndmask_b32_e64 v15, v6, v9, s[10:11] +; GFX942-NEXT: v_cndmask_b32_e64 v16, v5, v9, s[12:13] +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v0, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[26:27] +; GFX942-NEXT: v_cndmask_b32_e64 v17, v4, v9, s[14:15] +; GFX942-NEXT: v_perm_b32 v3, v3, v10, s33 +; GFX942-NEXT: v_cndmask_b32_sdwa v7, v7, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[28:29] +; GFX942-NEXT: v_perm_b32 v7, v7, v14, s33 +; GFX942-NEXT: v_perm_b32 v2, v2, v11, s33 +; GFX942-NEXT: v_cndmask_b32_sdwa v6, v6, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[30:31] +; GFX942-NEXT: v_perm_b32 v6, v6, v15, s33 +; GFX942-NEXT: v_perm_b32 v1, v1, v12, s33 +; GFX942-NEXT: v_cndmask_b32_sdwa v5, v5, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[20:21] +; GFX942-NEXT: v_perm_b32 v5, v5, v16, s33 +; GFX942-NEXT: v_perm_b32 v0, v0, v13, s33 +; GFX942-NEXT: v_cndmask_b32_sdwa v4, v4, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: v_perm_b32 v4, v4, v17, s33 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX942-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index e11900ac0ca68..de979f4f69698 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -2694,53 +2694,53 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void 
@v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; GFX9-LABEL: v_insertelement_v8f16_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX9-NEXT: s_mov_b32 s16, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] -; GFX9-NEXT: s_cmp_eq_u32 s5, 6 -; GFX9-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 7 -; GFX9-NEXT: s_mov_b32 s2, 0x5040100 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[14:15] +; GFX9-NEXT: s_cmp_eq_u32 s11, 6 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s11, 7 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s11, 4 +; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s11, 5 +; GFX9-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s11, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s11, 3 +; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s11, 0 +; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s11, 1 +; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 5 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 2 -; GFX9-NEXT: v_perm_b32 v3, v3, v6, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 3 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 0 -; GFX9-NEXT: v_perm_b32 v2, v6, v2, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX9-NEXT: v_perm_b32 v1, v6, v1, s2 -; GFX9-NEXT: v_perm_b32 v0, v5, v0, s2 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v3, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_sdwa v3, v3, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v2, v5, s[2:3] +; GFX9-NEXT: v_cndmask_b32_sdwa v2, v2, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[10:11] +; GFX9-NEXT: v_perm_b32 v3, v3, v6, s16 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, v5, s[6:7] +; GFX9-NEXT: v_cndmask_b32_sdwa v0, v0, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v2, v2, v7, s16 +; GFX9-NEXT: v_perm_b32 v1, v1, v8, s16 +; GFX9-NEXT: v_perm_b32 v0, v0, v6, s16 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v8f16_dynamic: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 @@ -2750,47 +2750,43 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; VI-NEXT: s_cmp_eq_u32 s9, 6 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 -; VI-NEXT: s_cmp_eq_u32 s5, 6 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s9, 7 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 7 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s9, 4 +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s9, 5 +; VI-NEXT: v_mov_b32_e32 v6, s8 +; VI-NEXT: s_cselect_b64 s[10:11], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s9, 2 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s9, 3 +; VI-NEXT: s_cselect_b64 s[12:13], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s9, 0 +; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s9, 1 +; VI-NEXT: s_cselect_b64 s[8:9], -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 4 -; VI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 5 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 2 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 3 -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; VI-NEXT: v_cndmask_b32_e64 v7, v3, v6, s[0:1] +; VI-NEXT: v_cndmask_b32_sdwa v3, v3, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[10:11] +; VI-NEXT: v_cndmask_b32_e64 v8, v2, v6, s[2:3] +; VI-NEXT: v_cndmask_b32_sdwa v2, v2, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[12:13] +; VI-NEXT: v_cndmask_b32_e64 v9, v1, v6, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[8:9] ; VI-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 0 -; VI-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 1 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; VI-NEXT: 
s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e64 v7, v0, v6, s[6:7] +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; @@ -3196,180 +3192,172 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; GFX9-LABEL: v_insertelement_v16f16_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX9-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[20:21], s[8:9], 0x10 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX9-NEXT: s_mov_b32 s33, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] -; GFX9-NEXT: global_load_dwordx4 v[5:8], v0, s[2:3] offset:16 -; GFX9-NEXT: s_cmp_eq_u32 s5, 6 -; GFX9-NEXT: v_mov_b32_e32 v9, s4 -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 7 -; GFX9-NEXT: s_mov_b32 s2, 0x5040100 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[18:19] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[18:19] offset:16 +; GFX9-NEXT: s_cmp_eq_u32 s21, 6 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s21, 7 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s21, 4 +; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s21, 5 +; GFX9-NEXT: v_mov_b32_e32 v9, s20 +; GFX9-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s21, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s21, 3 +; GFX9-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s21, 0 +; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s21, 1 +; GFX9-NEXT: s_cselect_b64 s[24:25], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s21, 14 +; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s21, 15 +; GFX9-NEXT: s_cselect_b64 s[26:27], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s21, 12 +; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s21, 13 +; GFX9-NEXT: s_cselect_b64 s[28:29], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s21, 10 +; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s21, 11 +; GFX9-NEXT: s_cselect_b64 s[30:31], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s21, 8 +; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s21, 9 +; GFX9-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v10, v4, v9, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc -; GFX9-NEXT: 
s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 5 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 2 -; GFX9-NEXT: v_perm_b32 v4, v4, v10, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 3 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v12, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 14 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v13, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 15 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v3, v9, s[0:1] +; GFX9-NEXT: v_cndmask_b32_sdwa v3, v3, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v2, v9, s[2:3] +; GFX9-NEXT: v_cndmask_b32_sdwa v2, v2, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[22:23] +; GFX9-NEXT: v_cndmask_b32_e64 v12, v1, v9, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[24:25] +; GFX9-NEXT: v_cndmask_b32_e64 v13, v0, v9, s[6:7] +; GFX9-NEXT: v_cndmask_b32_sdwa v0, v0, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[26:27] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 12 -; GFX9-NEXT: v_perm_b32 v1, v12, v1, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v14, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 13 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 10 -; GFX9-NEXT: v_perm_b32 v8, v12, v8, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v15, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 11 -; GFX9-NEXT: v_perm_b32 v3, v10, v3, s2 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 8 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 9 -; GFX9-NEXT: v_perm_b32 v2, v11, v2, s2 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc -; GFX9-NEXT: v_perm_b32 v7, v12, v7, s2 -; GFX9-NEXT: v_perm_b32 v6, v10, v6, s2 -; GFX9-NEXT: v_perm_b32 v5, v9, v5, s2 -; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16 -; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v14, v7, v9, s[8:9] +; GFX9-NEXT: v_cndmask_b32_sdwa v7, v7, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[28:29] +; GFX9-NEXT: v_cndmask_b32_e64 v15, 
v6, v9, s[10:11] +; GFX9-NEXT: v_cndmask_b32_sdwa v6, v6, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[30:31] +; GFX9-NEXT: v_perm_b32 v3, v3, v10, s33 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v5, v9, s[12:13] +; GFX9-NEXT: v_cndmask_b32_sdwa v5, v5, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[20:21] +; GFX9-NEXT: v_perm_b32 v2, v2, v11, s33 +; GFX9-NEXT: v_cndmask_b32_e64 v11, v4, v9, s[14:15] +; GFX9-NEXT: v_cndmask_b32_sdwa v4, v4, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v7, v7, v14, s33 +; GFX9-NEXT: v_perm_b32 v6, v6, v15, s33 +; GFX9-NEXT: v_perm_b32 v5, v5, v10, s33 +; GFX9-NEXT: v_perm_b32 v4, v4, v11, s33 +; GFX9-NEXT: v_perm_b32 v1, v1, v12, s33 +; GFX9-NEXT: v_perm_b32 v0, v0, v13, s33 +; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v16f16_dynamic: ; VI: ; %bb.0: +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[16:17], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8 +; VI-NEXT: s_cmp_eq_u32 s17, 14 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 -; VI-NEXT: s_cmp_eq_u32 s7, 14 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc -; VI-NEXT: v_mov_b32_e32 v12, s6 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 15 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cndmask_b32_e32 v13, v3, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 12 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 13 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; VI-NEXT: v_cndmask_b32_e64 v2, v2, v12, s[0:1] -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 10 +; VI-NEXT: s_cmp_eq_u32 s17, 15 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 11 -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; VI-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] +; VI-NEXT: s_cmp_eq_u32 s17, 12 +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 8 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_cmp_eq_u32 s17, 13 +; VI-NEXT: v_mov_b32_e32 v12, s16 +; VI-NEXT: s_cselect_b64 s[18:19], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s17, 10 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v15, v15, v12, s[2:3] -; VI-NEXT: s_cmp_eq_u32 s7, 9 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 6 -; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v15, v16, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 7 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[4:5] +; VI-NEXT: s_cmp_eq_u32 s17, 11 +; VI-NEXT: s_cselect_b64 s[20:21], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s17, 8 +; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s17, 9 +; VI-NEXT: s_cselect_b64 s[22:23], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s17, 6 +; VI-NEXT: s_cselect_b64 s[8:9], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s17, 7 +; VI-NEXT: s_cselect_b64 s[24:25], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s17, 4 +; VI-NEXT: s_cselect_b64 s[10:11], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s17, 5 +; VI-NEXT: s_cselect_b64 s[26:27], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s17, 2 +; VI-NEXT: s_cselect_b64 s[12:13], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s17, 3 +; VI-NEXT: s_cselect_b64 s[28:29], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s17, 0 +; VI-NEXT: s_cselect_b64 s[14:15], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s17, 1 +; VI-NEXT: s_cselect_b64 s[16:17], -1, 0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cndmask_b32_e64 v13, v3, v12, s[0:1] +; VI-NEXT: v_cndmask_b32_sdwa v3, v3, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[18:19] +; VI-NEXT: v_cndmask_b32_e64 v14, v2, v12, s[2:3] +; VI-NEXT: v_cndmask_b32_sdwa v2, v2, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[20:21] +; VI-NEXT: v_cndmask_b32_e64 v15, v1, v12, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[22:23] +; VI-NEXT: v_cndmask_b32_e64 v16, v0, v12, s[6:7] +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[24:25] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_cndmask_b32_e64 v14, v14, v12, s[0:1] -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 4 +; VI-NEXT: v_cndmask_b32_e64 v17, v7, v12, s[8:9] +; VI-NEXT: v_cndmask_b32_sdwa v7, v7, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[26:27] ; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v15, v17, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 5 -; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 2 -; VI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 3 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 0 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 1 -; VI-NEXT: v_or_b32_sdwa v6, v6, v13 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e64 v13, v6, v12, s[10:11] +; VI-NEXT: v_cndmask_b32_sdwa v6, v6, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[28:29] +; VI-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e64 v14, v5, v12, s[12:13] +; VI-NEXT: v_cndmask_b32_sdwa v5, v5, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[16:17] +; VI-NEXT: v_or_b32_sdwa v6, v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e64 v13, v4, v12, s[14:15] +; VI-NEXT: v_cndmask_b32_sdwa v4, v4, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; VI-NEXT: flat_store_dwordx4 v[10:11], v[0:3] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 362b9dacaf257..59c051fb9e741 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -664,16 +664,13 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX8-LABEL: v_maximum_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: v_max_f16_e32 v4, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; GFX8-NEXT: v_max_f16_e32 v3, v0, v1 +; GFX8-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v2, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v4, v0, v1 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 
s[30:31] ; @@ -684,9 +681,8 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -834,16 +830,13 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX8-LABEL: v_maximum_v2f16__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: v_max_f16_e32 v4, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; GFX8-NEXT: v_max_f16_e32 v3, v0, v1 +; GFX8-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v2, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v4, v0, v1 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -854,9 +847,8 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1017,11 +1009,10 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX8-NEXT: v_max_f16_e32 v1, s5, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s5, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX8-NEXT: v_cndmask_b32_sdwa v0, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NEXT: v_max_f16_e32 v3, s16, v1 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ;;#ASMSTART @@ -1041,9 +1032,8 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX900-NEXT: s_lshr_b32 s5, s16, 16 ; GFX900-NEXT: v_mov_b32_e32 v3, s4 ; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s5, v3 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; 
GFX900-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX900-NEXT: ;;#ASMSTART @@ -1170,20 +1160,17 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX8-LABEL: v_maximum_v3f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX8-NEXT: v_max_f16_e32 v6, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc -; GFX8-NEXT: v_max_f16_e32 v5, v1, v3 +; GFX8-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v6, v1, v3 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX8-NEXT: v_max_f16_e32 v3, v0, v2 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_maximum_v3f16: @@ -1196,9 +1183,8 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX900-NEXT: v_pk_max_f16 v3, v0, v2 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1377,20 +1363,17 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX8-LABEL: v_maximum_v3f16__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX8-NEXT: v_max_f16_e32 v6, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc -; GFX8-NEXT: v_max_f16_e32 v5, v1, v3 +; GFX8-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v6, v1, v3 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX8-NEXT: v_max_f16_e32 v3, v0, v2 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: 
v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_maximum_v3f16__nsz: @@ -1403,9 +1386,8 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX900-NEXT: v_pk_max_f16 v3, v0, v2 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1591,27 +1573,21 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX8-LABEL: v_maximum_v4f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: v_max_f16_e32 v6, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_max_f16_e32 v8, v6, v5 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; GFX8-NEXT: v_max_f16_e32 v6, v1, v3 +; GFX8-NEXT: v_max_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_sdwa v6, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v6, v5, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v7, v1, v3 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX8-NEXT: v_max_f16_e32 v3, v0, v2 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_maximum_v4f16: @@ -1621,15 +1597,13 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 
src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_max_f16 v3, v0, v2 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 @@ -1834,27 +1808,21 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX8-LABEL: v_maximum_v4f16__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: v_max_f16_e32 v6, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_max_f16_e32 v8, v6, v5 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; GFX8-NEXT: v_max_f16_e32 v6, v1, v3 +; GFX8-NEXT: v_max_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_sdwa v6, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v6, v5, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v7, v1, v3 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX8-NEXT: v_max_f16_e32 v3, v0, v2 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_maximum_v4f16__nsz: @@ -1864,15 +1832,13 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_max_f16 
v3, v0, v2 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 @@ -2105,47 +2071,35 @@ define <8 x half> @v_maximum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX8-LABEL: v_maximum_v8f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GFX8-NEXT: v_max_f16_e32 v10, v9, v8 -; GFX8-NEXT: v_mov_b32_e32 v11, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v9, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v11, v10, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX8-NEXT: v_max_f16_e32 v12, v10, v9 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v10, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v1 -; GFX8-NEXT: v_max_f16_e32 v13, v12, v10 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v12, v10 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; GFX8-NEXT: v_max_f16_e32 v14, v13, v12 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v13, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v12, v11, v14, vcc +; GFX8-NEXT: v_max_f16_sdwa v8, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v9, 0x7e00 +; GFX8-NEXT: v_cmp_o_f16 vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v8, v9, v8, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_sdwa v10, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_o_f16 vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v10, v9, v10, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_sdwa v11, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v11, v9, v11, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_sdwa v12, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v12, v9, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f16_e32 v13, v3, v7 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v13, vcc ; GFX8-NEXT: v_max_f16_e32 v7, v2, v6 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v11, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc ; GFX8-NEXT: v_max_f16_e32 v6, v1, v5 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v11, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc ; GFX8-NEXT: v_max_f16_e32 v5, v0, v4 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v11, v5, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 
v4, 16, v12 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_maximum_v8f16: @@ -2155,27 +2109,23 @@ define <8 x half> @v_maximum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX900-NEXT: v_mov_b32_e32 v9, 0x7e00 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v3, v9, v8, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_max_f16 v7, v2, v6 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 ; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v7, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v2, v9, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_max_f16 v6, v1, v5 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 ; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v9, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_max_f16 v5, v0, v4 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v5, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v9, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v6, s4 ; GFX900-NEXT: v_perm_b32 v1, v1, v7, s4 @@ -2433,87 +2383,63 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX8-LABEL: v_maximum_v16f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; GFX8-NEXT: v_max_f16_e32 v16, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v18, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; GFX8-NEXT: v_max_f16_e32 v20, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e64 
s[4:5], v18, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v12 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; GFX8-NEXT: v_max_f16_e32 v21, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e64 s[6:7], v18, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; GFX8-NEXT: v_max_f16_e32 v22, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e64 s[8:9], v18, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; GFX8-NEXT: v_max_f16_e32 v23, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e64 s[10:11], v18, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; GFX8-NEXT: v_max_f16_e32 v24, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e64 s[12:13], v18, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; GFX8-NEXT: v_max_f16_e32 v25, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e64 s[14:15], v18, v17 -; GFX8-NEXT: v_max_f16_e32 v17, v6, v14 -; GFX8-NEXT: v_cmp_o_f16_e64 s[16:17], v6, v14 -; GFX8-NEXT: v_max_f16_e32 v6, v5, v13 -; GFX8-NEXT: v_cmp_o_f16_e64 s[18:19], v5, v13 -; GFX8-NEXT: v_max_f16_e32 v5, v4, v12 -; GFX8-NEXT: v_cmp_o_f16_e64 s[20:21], v4, v12 -; GFX8-NEXT: v_max_f16_e32 v4, v3, v11 -; GFX8-NEXT: v_cmp_o_f16_e64 s[22:23], v3, v11 -; GFX8-NEXT: v_max_f16_e32 v11, v7, v15 -; GFX8-NEXT: v_cmp_o_f16_e64 s[24:25], v7, v15 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v15 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_mov_b32_e32 v19, 0x7e00 -; GFX8-NEXT: v_max_f16_e32 v13, v7, v12 -; GFX8-NEXT: v_cmp_o_f16_e64 s[26:27], v7, v12 -; GFX8-NEXT: v_max_f16_e32 v3, v2, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v12, v19, v13, s[26:27] -; GFX8-NEXT: v_cndmask_b32_e32 v13, v19, v16, vcc -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 -; GFX8-NEXT: v_max_f16_e32 v14, v1, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc +; GFX8-NEXT: v_max_f16_sdwa v16, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v17, 0x7e00 +; GFX8-NEXT: v_cmp_o_f16 vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v16, v17, v16, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_sdwa v18, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_o_f16 vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v18, v17, v18, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v19, v6, v14 +; GFX8-NEXT: v_cmp_o_f16_e64 s[4:5], v6, v14 +; GFX8-NEXT: v_max_f16_sdwa v6, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_o_f16 vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v6, v17, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v14, v5, v13 +; GFX8-NEXT: v_cmp_o_f16_e64 s[6:7], v5, v13 +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_o_f16 vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v5, v17, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v13, v4, v12 +; GFX8-NEXT: v_cmp_o_f16_e64 s[8:9], v4, v12 +; GFX8-NEXT: v_max_f16_sdwa v4, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_o_f16 vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: 
v_cndmask_b32_sdwa v4, v17, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v12, v3, v11 +; GFX8-NEXT: v_cmp_o_f16_e64 s[10:11], v3, v11 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_o_f16 vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v3, v17, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v11, v2, v10 +; GFX8-NEXT: v_cmp_o_f16_e64 s[12:13], v2, v10 +; GFX8-NEXT: v_max_f16_sdwa v2, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v2, v17, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v10, v7, v15 +; GFX8-NEXT: v_cmp_o_f16_e64 s[14:15], v7, v15 +; GFX8-NEXT: v_max_f16_sdwa v7, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v7, v17, v7, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e64 v15, v17, v19, s[4:5] +; GFX8-NEXT: v_max_f16_e32 v19, v1, v9 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 -; GFX8-NEXT: v_max_f16_e32 v7, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v18, v19, v22, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v22, v19, v25, s[14:15] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v19, vcc +; GFX8-NEXT: v_max_f16_e32 v9, v0, v8 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v16, v19, v21, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v21, v19, v24, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v19, v7, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v22 -; GFX8-NEXT: v_cndmask_b32_e64 v15, v19, v20, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v20, v19, v23, s[10:11] -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v20 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v19, v4, s[22:23] -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v19, v5, s[20:21] -; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v19, v6, s[18:19] -; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v15 -; GFX8-NEXT: v_cndmask_b32_e64 v11, v19, v11, s[24:25] -; GFX8-NEXT: v_cndmask_b32_e64 v17, v19, v17, s[16:17] -; GFX8-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v12 -; GFX8-NEXT: v_or_b32_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v7, v11, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 
v14, v17, v14, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v13, v17, v13, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v12, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v13, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v14, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v6, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_maximum_v16f16: @@ -2523,51 +2449,43 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX900-NEXT: v_mov_b32_e32 v17, 0x7e00 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 ; GFX900-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v7, v17, v16, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_max_f16 v15, v6, v14 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 ; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v6, v17, v15, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v6, v17, v15, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_max_f16 v14, v5, v13 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 ; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v14, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v5, v17, v14, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_max_f16 v13, v4, v12 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 ; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v13, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v4, v17, v13, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v4, v17, v13, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_max_f16 v12, v3, v11 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 ; GFX900-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v3, v17, v12, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_max_f16 v11, v2, v10 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 ; GFX900-NEXT: v_cndmask_b32_e32 v12, 
v17, v11, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v2, v17, v11, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v2, v17, v11, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_max_f16 v10, v1, v9 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 ; GFX900-NEXT: v_cndmask_b32_e32 v11, v17, v10, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v10, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v17, v10, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_max_f16 v9, v0, v8 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v17, v9, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v17, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v10, s4 ; GFX900-NEXT: v_perm_b32 v1, v1, v11, s4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index f6d37b34807b1..e33e34d7901cc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -554,16 +554,13 @@ define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX8-LABEL: v_minimum_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: v_min_f16_e32 v4, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; GFX8-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX8-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v2, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v4, v0, v1 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -574,9 +571,8 @@ define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -689,16 +685,13 @@ define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX8-LABEL: v_minimum_v2f16__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: v_min_f16_e32 v4, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; GFX8-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX8-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v2, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v4, v0, v1 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -709,9 +702,8 @@ define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -830,11 +822,10 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX8-NEXT: v_min_f16_e32 v1, s5, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s5, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX8-NEXT: v_cndmask_b32_sdwa v0, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NEXT: v_min_f16_e32 v3, s16, v1 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ;;#ASMSTART @@ -854,9 +845,8 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX900-NEXT: s_lshr_b32 s5, s16, 16 ; GFX900-NEXT: v_mov_b32_e32 v3, s4 ; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s5, v3 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX900-NEXT: ;;#ASMSTART @@ -956,20 +946,17 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX8-LABEL: v_minimum_v3f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX8-NEXT: v_min_f16_e32 v6, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc -; GFX8-NEXT: v_min_f16_e32 v5, v1, v3 +; GFX8-NEXT: v_min_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: 
v_mov_b32_e32 v5, 0x7e00 +; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v6, v1, v3 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX8-NEXT: v_min_f16_e32 v3, v0, v2 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_minimum_v3f16: @@ -982,9 +969,8 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX900-NEXT: v_pk_min_f16 v3, v0, v2 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1116,20 +1102,17 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX8-LABEL: v_minimum_v3f16__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX8-NEXT: v_min_f16_e32 v6, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc -; GFX8-NEXT: v_min_f16_e32 v5, v1, v3 +; GFX8-NEXT: v_min_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v6, v1, v3 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX8-NEXT: v_min_f16_e32 v3, v0, v2 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_minimum_v3f16__nsz: @@ -1142,9 +1125,8 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX900-NEXT: v_pk_min_f16 v3, v0, v2 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; 
GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1276,27 +1258,21 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX8-LABEL: v_minimum_v4f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: v_min_f16_e32 v6, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_min_f16_e32 v8, v6, v5 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; GFX8-NEXT: v_min_f16_e32 v6, v1, v3 +; GFX8-NEXT: v_min_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_sdwa v6, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v6, v5, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v7, v1, v3 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX8-NEXT: v_min_f16_e32 v3, v0, v2 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_minimum_v4f16: @@ -1306,15 +1282,13 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_min_f16 v3, v0, v2 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 @@ -1460,27 +1434,21 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX8-LABEL: 
v_minimum_v4f16__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: v_min_f16_e32 v6, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_min_f16_e32 v8, v6, v5 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; GFX8-NEXT: v_min_f16_e32 v6, v1, v3 +; GFX8-NEXT: v_min_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_sdwa v6, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v6, v5, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v7, v1, v3 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX8-NEXT: v_min_f16_e32 v3, v0, v2 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_minimum_v4f16__nsz: @@ -1490,15 +1458,13 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_min_f16 v3, v0, v2 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 @@ -1644,47 +1610,35 @@ define <8 x half> @v_minimum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX8-LABEL: v_minimum_v8f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GFX8-NEXT: v_min_f16_e32 v10, v9, v8 -; 
GFX8-NEXT: v_mov_b32_e32 v11, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v9, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v11, v10, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX8-NEXT: v_min_f16_e32 v12, v10, v9 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v10, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v1 -; GFX8-NEXT: v_min_f16_e32 v13, v12, v10 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v12, v10 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; GFX8-NEXT: v_min_f16_e32 v14, v13, v12 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v13, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v12, v11, v14, vcc +; GFX8-NEXT: v_min_f16_sdwa v8, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v9, 0x7e00 +; GFX8-NEXT: v_cmp_o_f16 vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v8, v9, v8, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_sdwa v10, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_o_f16 vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v10, v9, v10, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_sdwa v11, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v11, v9, v11, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_sdwa v12, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v12, v9, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f16_e32 v13, v3, v7 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v13, vcc ; GFX8-NEXT: v_min_f16_e32 v7, v2, v6 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v11, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc ; GFX8-NEXT: v_min_f16_e32 v6, v1, v5 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v11, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc ; GFX8-NEXT: v_min_f16_e32 v5, v0, v4 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v11, v5, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
GFX8-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_minimum_v8f16: @@ -1694,27 +1648,23 @@ define <8 x half> @v_minimum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX900-NEXT: v_mov_b32_e32 v9, 0x7e00 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v3, v9, v8, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_min_f16 v7, v2, v6 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 ; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v7, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v2, v9, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_min_f16 v6, v1, v5 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 ; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v9, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_min_f16 v5, v0, v4 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v5, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v9, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v6, s4 ; GFX900-NEXT: v_perm_b32 v1, v1, v7, s4 @@ -1852,87 +1802,63 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX8-LABEL: v_minimum_v16f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; GFX8-NEXT: v_min_f16_e32 v16, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v18, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; GFX8-NEXT: v_min_f16_e32 v20, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e64 s[4:5], v18, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v12 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; GFX8-NEXT: v_min_f16_e32 v21, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e64 s[6:7], v18, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; GFX8-NEXT: v_min_f16_e32 v22, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e64 s[8:9], v18, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; GFX8-NEXT: v_min_f16_e32 v23, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e64 s[10:11], v18, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; GFX8-NEXT: v_min_f16_e32 v24, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e64 s[12:13], v18, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; GFX8-NEXT: 
v_min_f16_e32 v25, v18, v17 -; GFX8-NEXT: v_cmp_o_f16_e64 s[14:15], v18, v17 -; GFX8-NEXT: v_min_f16_e32 v17, v6, v14 -; GFX8-NEXT: v_cmp_o_f16_e64 s[16:17], v6, v14 -; GFX8-NEXT: v_min_f16_e32 v6, v5, v13 -; GFX8-NEXT: v_cmp_o_f16_e64 s[18:19], v5, v13 -; GFX8-NEXT: v_min_f16_e32 v5, v4, v12 -; GFX8-NEXT: v_cmp_o_f16_e64 s[20:21], v4, v12 -; GFX8-NEXT: v_min_f16_e32 v4, v3, v11 -; GFX8-NEXT: v_cmp_o_f16_e64 s[22:23], v3, v11 -; GFX8-NEXT: v_min_f16_e32 v11, v7, v15 -; GFX8-NEXT: v_cmp_o_f16_e64 s[24:25], v7, v15 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v15 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_mov_b32_e32 v19, 0x7e00 -; GFX8-NEXT: v_min_f16_e32 v13, v7, v12 -; GFX8-NEXT: v_cmp_o_f16_e64 s[26:27], v7, v12 -; GFX8-NEXT: v_min_f16_e32 v3, v2, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v12, v19, v13, s[26:27] -; GFX8-NEXT: v_cndmask_b32_e32 v13, v19, v16, vcc -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 -; GFX8-NEXT: v_min_f16_e32 v14, v1, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc +; GFX8-NEXT: v_min_f16_sdwa v16, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v17, 0x7e00 +; GFX8-NEXT: v_cmp_o_f16 vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v16, v17, v16, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_sdwa v18, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_o_f16 vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v18, v17, v18, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v19, v6, v14 +; GFX8-NEXT: v_cmp_o_f16_e64 s[4:5], v6, v14 +; GFX8-NEXT: v_min_f16_sdwa v6, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_o_f16 vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v6, v17, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v14, v5, v13 +; GFX8-NEXT: v_cmp_o_f16_e64 s[6:7], v5, v13 +; GFX8-NEXT: v_min_f16_sdwa v5, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_o_f16 vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v5, v17, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v13, v4, v12 +; GFX8-NEXT: v_cmp_o_f16_e64 s[8:9], v4, v12 +; GFX8-NEXT: v_min_f16_sdwa v4, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_o_f16 vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v4, v17, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v12, v3, v11 +; GFX8-NEXT: v_cmp_o_f16_e64 s[10:11], v3, v11 +; GFX8-NEXT: v_min_f16_sdwa v3, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_o_f16 vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v3, v17, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v11, v2, v10 +; GFX8-NEXT: v_cmp_o_f16_e64 s[12:13], v2, v10 +; GFX8-NEXT: v_min_f16_sdwa v2, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v2, v17, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v10, v7, v15 +; GFX8-NEXT: v_cmp_o_f16_e64 s[14:15], v7, v15 +; GFX8-NEXT: v_min_f16_sdwa v7, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_sdwa v7, v17, v7, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e64 v15, v17, v19, s[4:5] +; GFX8-NEXT: v_min_f16_e32 v19, v1, v9 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 -; GFX8-NEXT: v_min_f16_e32 v7, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v18, v19, v22, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v22, v19, v25, s[14:15] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v19, vcc +; GFX8-NEXT: v_min_f16_e32 v9, v0, v8 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v16, v19, v21, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v21, v19, v24, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v19, v7, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v22 -; GFX8-NEXT: v_cndmask_b32_e64 v15, v19, v20, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v20, v19, v23, s[10:11] -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v20 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v19, v4, s[22:23] -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v19, v5, s[20:21] -; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v19, v6, s[18:19] -; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v15 -; GFX8-NEXT: v_cndmask_b32_e64 v11, v19, v11, s[24:25] -; GFX8-NEXT: v_cndmask_b32_e64 v17, v19, v17, s[16:17] -; GFX8-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v12 -; GFX8-NEXT: v_or_b32_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v7, v11, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v14, v17, v14, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v13, v17, v13, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v12, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v13, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v14, v6 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v6, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_minimum_v16f16: @@ -1942,51 +1868,43 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX900-NEXT: v_mov_b32_e32 v17, 0x7e00 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 ; GFX900-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v7, v17, v16, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_min_f16 v15, v6, v14 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 ; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v6, v17, v15, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v6, v17, v15, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_min_f16 v14, v5, v13 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 ; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v14, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v5, v17, v14, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_min_f16 v13, v4, v12 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 ; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v13, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v4, v17, v13, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v4, v17, v13, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_min_f16 v12, v3, v11 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 ; GFX900-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v3, v17, v12, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_min_f16 v11, v2, v10 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 ; GFX900-NEXT: v_cndmask_b32_e32 v12, v17, v11, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v2, v17, v11, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v2, v17, v11, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_min_f16 v10, v1, v9 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 ; GFX900-NEXT: v_cndmask_b32_e32 v11, v17, v10, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v10, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v17, v10, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_min_f16 v9, v0, v8 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 ; 
GFX900-NEXT: v_cndmask_b32_e32 v10, v17, v9, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v17, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v10, s4 ; GFX900-NEXT: v_perm_b32 v1, v1, v11, s4 diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll index 718a266f49f5d..e90fe93cf8d1c 100644 --- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll @@ -211,8 +211,7 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) { ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_cndmask_b32_sdwa v2, v1, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -243,8 +242,7 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) { ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_cndmask_b32_sdwa v2, v1, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll index 2b4d687bb0c29..ae37f4b43896c 100644 --- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll @@ -211,8 +211,7 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) { ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_cndmask_b32_sdwa v2, v1, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -244,8 +243,7 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) { ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_cndmask_b32_sdwa v2, v1, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll index 4e27cf20d3c98..7ff0d3f511c60 100644 --- a/llvm/test/CodeGen/AMDGPU/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll @@ -184,17 +184,17 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8-NEXT: v_add_u16_e32 v4, v3, v2 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v3 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2 +; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v4 +; GFX8-NEXT: v_cmp_eq_u64_e64 
vcc, 1, s[4:5] ; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX8-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v1 ; GFX8-NEXT: v_add_u16_e32 v1, v0, v1 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1 ; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -247,10 +247,11 @@ define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { ; GFX8-NEXT: v_add_u16_e32 v6, v5, v4 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4 +; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 +; GFX8-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[4:5] ; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8-NEXT: v_cndmask_b32_sdwa v4, v6, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v3 ; GFX8-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 @@ -265,8 +266,7 @@ define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { ; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_v3i16: @@ -324,17 +324,17 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8-NEXT: v_add_u16_e32 v6, v5, v4 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4 +; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 +; GFX8-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[4:5] ; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8-NEXT: v_cndmask_b32_sdwa v4, v6, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v2 ; GFX8-NEXT: v_add_u16_e32 v2, v0, v2 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2 ; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 @@ -342,17 +342,17 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8-NEXT: v_add_u16_e32 v5, v4, v2 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2 +; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v5 +; GFX8-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[4:5] ; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: 
v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX8-NEXT: v_cndmask_b32_sdwa v2, v5, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v3 ; GFX8-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 ; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3 ; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-vcnd_mask-1.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-vcnd_mask-1.mir new file mode 100644 index 0000000000000..011bbf4dbe75d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-vcnd_mask-1.mir @@ -0,0 +1,48 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc %s -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - | FileCheck -check-prefix=gfx9 %s + +# Test conversion of V_CNDMASK_B32 to VOP2 for enabling further conversion to SDWA. +# For this, the definition of the src2 carry-in operand must be changed to write +# to VCC. + +--- +name: v_vselect_v2bf16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; gfx9-LABEL: name: v_vselect_v2bf16 + ; gfx9: liveins: $vgpr0, $vgpr1, $vgpr2 + ; gfx9-NEXT: {{ $}} + ; gfx9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; gfx9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; gfx9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; gfx9-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY1]], implicit $exec + ; gfx9-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY2]], implicit $exec + ; gfx9-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 killed [[V_AND_B32_e64_1]], 1, implicit $exec + ; gfx9-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY]], killed [[V_CMP_EQ_U32_e64_]], implicit $exec + ; gfx9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY]], implicit $exec + ; gfx9-NEXT: $vcc = V_CMP_EQ_U32_e64 killed [[V_AND_B32_e64_]], 1, implicit $exec + ; gfx9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; gfx9-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[COPY]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec + ; gfx9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 84148480 + ; gfx9-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 killed [[V_CNDMASK_B32_sdwa]], killed [[V_CNDMASK_B32_e64_]], killed [[S_MOV_B32_]], implicit $exec + ; gfx9-NEXT: $vgpr0 = COPY [[V_PERM_B32_e64_]] + ; gfx9-NEXT: SI_RETURN implicit $vgpr0 + %10:vgpr_32 = COPY $vgpr2 + %9:vgpr_32 = COPY $vgpr1 + %8:vgpr_32 = COPY $vgpr0 + %11:vgpr_32 = V_AND_B32_e64 1, %9, implicit $exec + %12:sreg_64_xexec = V_CMP_EQ_U32_e64 killed %11, 1, implicit $exec + %13:vgpr_32 = V_AND_B32_e64 1, %8, implicit $exec + %14:sreg_64_xexec = V_CMP_EQ_U32_e64 killed %13, 1, implicit $exec + %17:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %10, killed %14, implicit $exec + %20:vgpr_32 = V_LSHRREV_B32_e64 16, %10, implicit $exec + %22:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %20, killed %12, implicit $exec + %24:sreg_32 = S_MOV_B32 84148480 + %25:vgpr_32 = V_PERM_B32_e64 killed %22, killed %17, killed %24, implicit $exec + $vgpr0 = COPY %25 + SI_RETURN
implicit $vgpr0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-vcnd_mask-2.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-vcnd_mask-2.mir new file mode 100644 index 0000000000000..e66d7d80a803c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-vcnd_mask-2.mir @@ -0,0 +1,43 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc %s -mtriple=amdgcn -mcpu=gfx1030 -run-pass=si-peephole-sdwa -o - | FileCheck %s + +# For conversion of V_CNDMASK_B32_e64 to SDWA, the destination of V_CMP_O_F16_e64 must be +# changed to vcc_lo first. This would introduce a vcc_hi use that requires special +# handling in si-peephole-sdwa. + +--- +name: v_minimum_v2f16__nsz +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: v_minimum_v2f16__nsz + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_PK_MIN_F16_:%[0-9]+]]:vgpr_32 = V_PK_MIN_F16 8, [[COPY]], 8, undef [[DEF]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 32256, 0, [[V_PK_MIN_F16_]], killed undef [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_CMP_O_F16_sdwa:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_O_F16_sdwa 0, [[COPY]], 0, undef [[V_CNDMASK_B32_e64_]], 0, 5, 6, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[V_PK_MIN_F16_]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 32256, 0, undef [[V_LSHRREV_B32_e64_1]], killed undef [[V_CMP_O_F16_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 killed [[V_CNDMASK_B32_e64_1]], [[V_CNDMASK_B32_e64_]], 84148480, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_PERM_B32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %30:vgpr_32 = IMPLICIT_DEF + %31:sreg_32_xm0_xexec = IMPLICIT_DEF + %8:vgpr_32 = COPY $vgpr0 + %13:vgpr_32 = V_PK_MIN_F16 8, %8, 8, undef %30, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %16:vgpr_32 = V_CNDMASK_B32_e64 0, 32256, 0, %13, killed undef %31, implicit $exec + %20:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec + %22:sreg_32_xm0_xexec = V_CMP_O_F16_e64 0, undef %20, 0, undef %16, 0, implicit $mode, implicit $exec + %23:vgpr_32 = V_LSHRREV_B32_e64 16, undef %13, implicit $exec + %25:vgpr_32 = V_CNDMASK_B32_e64 0, 32256, 0, undef %23, killed undef %22, implicit $exec + %27:vgpr_32 = V_PERM_B32_e64 killed %25, killed %16, 84148480, implicit $exec + $vgpr0 = COPY %27 + SI_RETURN implicit $vgpr0 + +...
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll index a2e1ae83a6e5f..aa65c6afd4eb9 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll @@ -42,10 +42,8 @@ define <2 x half> @add_select_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -58,10 +56,8 @@ define <2 x half> @add_select_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 @@ -175,14 +171,12 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fabs_fabs_v2f16(<2 x ; VI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v5 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_add_f16_sdwa v1, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_f16_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v2, v2, v4 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -194,10 +188,8 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fabs_fabs_v2f16(<2 x ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v5 @@ -313,10 +305,8 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fabs_fabs_v2f1 ; VI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; 
VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -330,10 +320,8 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fabs_fabs_v2f1 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 @@ -454,14 +442,12 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fabs_fabs_v2f16(<2 x ; VI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_add_f16_sdwa v1, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v2, v3, v5 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -473,10 +459,8 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fabs_fabs_v2f16(<2 x ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 @@ -589,10 +573,8 @@ define <2 x half> @add_select_fabs_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -604,10 +586,8 @@ define <2 x half> @add_select_fabs_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 @@ -706,9 +686,8 @@ define <2 x half> @add_select_fabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-NEXT: v_mov_b32_e32 v4, 0xbc00 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v3 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -721,9 +700,8 @@ define <2 x half> @add_select_fabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-NEXT: v_mov_b32_e32 v4, 0xbc00 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v3 @@ -1039,9 +1017,8 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-NEXT: v_mov_b32_e32 v4, 0xbc00 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v3 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1054,9 +1031,8 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-NEXT: v_mov_b32_e32 v4, 0xbc00 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v3 @@ -1152,9 +1128,8 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x ; VI-NEXT: v_mov_b32_e32 v4, 0xe400 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v3 ; VI-NEXT: v_or_b32_e32 
v0, v0, v1 @@ -1167,9 +1142,8 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x ; GFX9-NEXT: v_mov_b32_e32 v4, 0xe400 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v3 @@ -1264,9 +1238,8 @@ define <2 x half> @add_select_fabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-NEXT: v_mov_b32_e32 v4, 0x3c00 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v3 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1279,9 +1252,8 @@ define <2 x half> @add_select_fabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v3 @@ -1376,9 +1348,8 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-NEXT: v_mov_b32_e32 v4, 0x3c00 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v3 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1391,9 +1362,8 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v3 @@ -1488,10 +1458,8 @@ define <2 x half> @add_select_fneg_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-LABEL: add_select_fneg_fneg_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; 
VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: v_sub_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1502,10 +1470,8 @@ define <2 x half> @add_select_fneg_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-LABEL: add_select_fneg_fneg_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 @@ -1605,16 +1571,14 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fneg_fneg_v2f16(<2 x ; VI-LABEL: add_select_multi_use_lhs_fneg_fneg_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: v_sub_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v4, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_sub_f16_sdwa v1, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_sub_f16_sdwa v1, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_sub_f16_e32 v2, v5, v2 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1622,10 +1586,8 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fneg_fneg_v2f16(<2 x ; GFX9-LABEL: add_select_multi_use_lhs_fneg_fneg_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 @@ -1729,10 +1691,8 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fneg_fneg_v2f1 ; VI-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: v_xor_b32_e32 v5, 0x80008000, v2 @@ -1745,10 +1705,8 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fneg_fneg_v2f1 ; GFX9-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; 
GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 @@ -1856,16 +1814,14 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fneg_fneg_v2f16(<2 x ; VI-LABEL: add_select_multi_use_rhs_fneg_fneg_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: v_sub_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v4, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_sub_f16_sdwa v1, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_sub_f16_sdwa v1, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_sub_f16_e32 v2, v5, v3 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1873,10 +1829,8 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fneg_fneg_v2f16(<2 x ; GFX9-LABEL: add_select_multi_use_rhs_fneg_fneg_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 @@ -1983,10 +1937,8 @@ define <2 x half> @add_select_fneg_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -1998,10 +1950,8 @@ define <2 x half> @add_select_fneg_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 @@ -2096,12 +2046,11 @@ define <2 x half> @add_select_fneg_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-LABEL: add_select_fneg_negk_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-NEXT: v_mov_b32_e32 v5, 0x3c00 +; VI-NEXT: v_mov_b32_e32 v4, 0x3c00 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; VI-NEXT: v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2110,12 +2059,11 @@ define <2 x half> @add_select_fneg_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-LABEL: add_select_fneg_negk_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x3c00 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] @@ -2201,12 +2149,11 @@ define <2 x half> @add_select_fneg_inv2pi_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; VI-LABEL: add_select_fneg_inv2pi_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-NEXT: v_mov_b32_e32 v5, 0xb118 +; VI-NEXT: v_mov_b32_e32 v4, 0xb118 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; VI-NEXT: v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2215,12 +2162,11 @@ define <2 x half> @add_select_fneg_inv2pi_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-LABEL: add_select_fneg_inv2pi_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0xb118 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xb118 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] @@ -2306,12 +2252,11 @@ define <2 x half> @add_select_fneg_neginv2pi_v2f16(<2 x i32> %c, <2 x half> %x, ; VI-LABEL: add_select_fneg_neginv2pi_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-NEXT: v_mov_b32_e32 v5, 0x3118 +; VI-NEXT: v_mov_b32_e32 v4, 0x3118 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; VI-NEXT: v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2320,12 +2265,11 @@ define <2 x half> @add_select_fneg_neginv2pi_v2f16(<2 x i32> %c, <2 x half> %x, ; GFX9-LABEL: add_select_fneg_neginv2pi_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x3118 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x3118 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] @@ -2725,12 +2669,11 @@ define <2 x half> @add_select_negk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-LABEL: add_select_negk_fneg_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-NEXT: v_mov_b32_e32 v5, 0x3c00 +; VI-NEXT: v_mov_b32_e32 v4, 0x3c00 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; VI-NEXT: v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2739,12 +2682,11 @@ define <2 x half> @add_select_negk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-LABEL: add_select_negk_fneg_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x3c00 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] @@ -2829,12 +2771,11 @@ define <2 x half> @add_select_fneg_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-LABEL: add_select_fneg_posk_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-NEXT: v_mov_b32_e32 v5, 0xbc00 +; VI-NEXT: v_mov_b32_e32 v4, 0xbc00 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; VI-NEXT: v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2843,12 +2784,11 @@ define <2 x half> @add_select_fneg_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-LABEL: add_select_fneg_posk_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0xbc00 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xbc00 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] @@ -2933,12 +2873,11 @@ define <2 x half> @add_select_posk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-LABEL: add_select_posk_fneg_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-NEXT: v_mov_b32_e32 v5, 0xbc00 +; VI-NEXT: v_mov_b32_e32 v4, 0xbc00 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; VI-NEXT: v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2947,12 +2886,11 @@ define <2 x half> @add_select_posk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-LABEL: add_select_posk_fneg_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0xbc00 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xbc00 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] @@ -3049,10 +2987,8 @@ define <2 x half> @add_select_negfabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; VI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3065,10 +3001,8 @@ define <2 x half> @add_select_negfabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 
16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 @@ -3181,10 +3115,8 @@ define <2 x half> @add_select_fabs_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; VI-NEXT: v_or_b32_e32 v3, 0x80008000, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3197,10 +3129,8 @@ define <2 x half> @add_select_fabs_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-NEXT: v_or_b32_e32 v3, 0x80008000, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 @@ -3313,10 +3243,8 @@ define <2 x half> @add_select_neg_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; VI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3329,10 +3257,8 @@ define <2 x half> @add_select_neg_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 @@ -3444,10 +3370,8 @@ define <2 x half> @add_select_fabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; VI-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; 
VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3460,10 +3384,8 @@ define <2 x half> @add_select_fabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; GFX9-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 @@ -3568,10 +3490,8 @@ define <2 x half> @add_select_neg_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: v_sub_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -3583,10 +3503,8 @@ define <2 x half> @add_select_neg_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 @@ -3688,10 +3606,8 @@ define <2 x half> @add_select_negfabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; VI-NEXT: v_sub_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -3703,10 +3619,8 @@ define <2 x half> @add_select_negfabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 @@ -3811,9 +3725,8 @@ define <2 x half> @mul_select_negfabs_posk_v2f16(<2 x i32> %c, <2 x half> 
%x, <2 ; VI-NEXT: v_mov_b32_e32 v4, 0x4400 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_e32 v0, v0, v3 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3826,9 +3739,8 @@ define <2 x half> @mul_select_negfabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x4400 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 @@ -3928,9 +3840,8 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; VI-NEXT: v_mov_b32_e32 v4, 0x4400 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_e32 v0, v0, v3 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3943,9 +3854,8 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x4400 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 @@ -4045,9 +3955,8 @@ define <2 x half> @mul_select_negfabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; VI-NEXT: v_mov_b32_e32 v4, 0xc400 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_e32 v0, v0, v3 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -4060,9 +3969,8 @@ define <2 x half> @mul_select_negfabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xc400 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 
; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 @@ -4162,9 +4070,8 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; VI-NEXT: v_mov_b32_e32 v4, 0xc400 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_e32 v0, v0, v3 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -4177,9 +4084,8 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xc400 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 @@ -4287,10 +4193,8 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ; VI-SAFE-NEXT: v_mov_b32_e32 v3, 0x4000 ; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; @@ -4302,9 +4206,8 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ; GFX9-SAFE-NEXT: v_mov_b32_e32 v3, 0x4000 ; GFX9-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-SAFE-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-SAFE-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] @@ -4362,8 +4265,7 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; VI-NSZ-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NSZ-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NSZ-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NSZ-NEXT: s_setpc_b64 s[30:31] ; @@ -4374,9 +4276,8 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ; GFX9-NSZ-NEXT: v_mov_b32_e32 v3, 0x4000 ; GFX9-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NSZ-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NSZ-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NSZ-NEXT: 
v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NSZ-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NSZ-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NSZ-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NSZ-NEXT: s_setpc_b64 s[30:31] @@ -4447,10 +4348,8 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; VI-SAFE-NEXT: v_mov_b32_e32 v3, 0x4000 ; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; @@ -4462,9 +4361,8 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; GFX9-SAFE-NEXT: v_mov_b32_e32 v3, 0x4000 ; GFX9-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-SAFE-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-SAFE-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] @@ -4522,8 +4420,7 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; VI-NSZ-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NSZ-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NSZ-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NSZ-NEXT: s_setpc_b64 s[30:31] ; @@ -4534,9 +4431,8 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; GFX9-NSZ-NEXT: v_mov_b32_e32 v3, 0x4000 ; GFX9-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NSZ-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NSZ-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NSZ-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NSZ-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NSZ-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NSZ-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NSZ-NEXT: s_setpc_b64 s[30:31] @@ -4598,8 +4494,7 @@ define <2 x half> @select_fneg_posk_src_mul_v2f16(<2 x i32> %c, <2 x half> %x) { ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4610,9 +4505,8 @@ define <2 x half> @select_fneg_posk_src_mul_v2f16(<2 x i32> %c, <2 x half> %x) { ; GFX9-NEXT: v_mov_b32_e32 v3, 0x4000 ; GFX9-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4714,10 +4608,8 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, < ; VI-SAFE-NEXT: v_mov_b32_e32 v3, 0x4000 ; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; @@ -4729,9 +4621,8 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, < ; GFX9-SAFE-NEXT: v_mov_b32_e32 v3, 0x4000 ; GFX9-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-SAFE-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-SAFE-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] @@ -4775,8 +4666,7 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, < ; VI-NSZ-NEXT: v_fma_f16 v4, v5, -4.0, -v4 ; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NSZ-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NSZ-NEXT: v_cndmask_b32_sdwa v1, v3, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NSZ-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NSZ-NEXT: s_setpc_b64 s[30:31] ; @@ -4787,9 +4677,8 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, < ; GFX9-NSZ-NEXT: v_mov_b32_e32 v3, 0x4000 ; GFX9-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NSZ-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NSZ-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NSZ-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NSZ-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NSZ-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NSZ-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NSZ-NEXT: s_setpc_b64 s[30:31] @@ -4868,10 +4757,8 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ; VI-SAFE-NEXT: v_mov_b32_e32 v3, 0x4000 ; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-SAFE-NEXT: 
v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; @@ -4883,9 +4770,8 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ; GFX9-SAFE-NEXT: v_mov_b32_e32 v3, 0x4000 ; GFX9-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-SAFE-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-SAFE-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] @@ -4950,8 +4836,7 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ; VI-NSZ-NEXT: v_fma_f16 v4, v5, -4.0, -v4 ; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NSZ-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NSZ-NEXT: v_cndmask_b32_sdwa v1, v3, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NSZ-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NSZ-NEXT: s_setpc_b64 s[30:31] ; @@ -4962,9 +4847,8 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ; GFX9-NSZ-NEXT: v_mov_b32_e32 v3, 0x4000 ; GFX9-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NSZ-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NSZ-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NSZ-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NSZ-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NSZ-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NSZ-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NSZ-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index 7339b545686f5..862bc792680f9 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -809,12 +809,10 @@ define amdgpu_kernel void @select_v2f16( ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x44 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s16, s10 ; VI-NEXT: s_mov_b32 s17, s11 -; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_mov_b32 s19, s3 ; VI-NEXT: s_mov_b32 s20, s12 ; VI-NEXT: s_mov_b32 s21, s13 @@ -823,27 +821,22 @@ define amdgpu_kernel void @select_v2f16( ; VI-NEXT: s_mov_b32 s12, s14 ; VI-NEXT: s_mov_b32 s13, s15 ; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; VI-NEXT: buffer_load_dword v1, off, s[20:23], 0 -; VI-NEXT: buffer_load_dword v2, off, s[16:19], 0 ; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v1, off, s[20:23], 0 +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; VI-NEXT: buffer_load_dword v3, off, s[12:15], 0 ; VI-NEXT: s_mov_b32 s0, s8 ; VI-NEXT: s_mov_b32 s1, s9 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; 
VI-NEXT: v_cmp_lt_f16_e32 vcc, v2, v1 +; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; VI-NEXT: v_cmp_lt_f16_e32 vcc, v6, v5 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; VI-NEXT: v_cmp_lt_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cndmask_b32_sdwa v0, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -1023,20 +1016,16 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 -; VI-NEXT: s_movk_i32 s2, 0x3900 +; VI-NEXT: v_mov_b32_e32 v3, 0x3900 ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_cmp_lt_f16_e32 vcc, s2, v3 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc +; VI-NEXT: v_cmp_gt_f16 vcc, v0, v3 src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_sdwa v0, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; @@ -1202,20 +1191,16 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 -; VI-NEXT: s_movk_i32 s2, 0x3900 +; VI-NEXT: v_mov_b32_e32 v3, 0x3900 ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_cmp_gt_f16_e32 vcc, s2, v3 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc +; VI-NEXT: v_cmp_lt_f16 vcc, v0, v3 src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_sdwa v0, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; @@ -1362,42 +1347,37 @@ define amdgpu_kernel void @select_v2f16_imm_c( ; ; VI-LABEL: select_v2f16_imm_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 
-; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s18, s2 -; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s16, s8 -; VI-NEXT: s_mov_b32 s17, s9 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s8, s10 -; VI-NEXT: s_mov_b32 s9, s11 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 -; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s11, s3 -; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; VI-NEXT: v_mov_b32_e32 v3, 0x3800 ; VI-NEXT: v_mov_b32_e32 v4, 0x3900 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v0 +; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v6, v5 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; VI-NEXT: v_cmp_nlt_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cndmask_b32_sdwa v0, v4, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: select_v2f16_imm_c: @@ -1543,42 +1523,37 @@ define amdgpu_kernel void @select_v2f16_imm_d( ; ; VI-LABEL: select_v2f16_imm_d: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s18, s2 -; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s16, s8 -; VI-NEXT: s_mov_b32 s17, s9 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s8, s10 -; VI-NEXT: s_mov_b32 s9, s11 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 -; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s11, s3 -; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: 
s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; VI-NEXT: v_mov_b32_e32 v3, 0x3800 ; VI-NEXT: v_mov_b32_e32 v4, 0x3900 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; VI-NEXT: v_cmp_lt_f16_e32 vcc, v1, v0 +; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; VI-NEXT: v_cmp_lt_f16_e32 vcc, v6, v5 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; VI-NEXT: v_cmp_lt_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cndmask_b32_sdwa v0, v4, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: select_v2f16_imm_d: @@ -1757,22 +1732,16 @@ define <4 x half> @v_vselect_v4f16(<4 x half> %a, <4 x half> %b, <4 x i32> %cond ; VI-LABEL: v_vselect_v4f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; VI-NEXT: v_cndmask_b32_sdwa v7, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v9, v8, vcc +; VI-NEXT: v_cndmask_b32_sdwa v5, v2, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_vselect_v4f16: @@ -1949,22 +1918,14 @@ define <8 x half> @v_vselect_v8f16(<8 x half> %a, <8 x half> %b, <8 x i32> %cond ; VI-LABEL: v_vselect_v8f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v17, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; VI-NEXT: 
v_cndmask_b32_sdwa v15, v7, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v17, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; VI-NEXT: v_cndmask_b32_sdwa v13, v6, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v17, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 +; VI-NEXT: v_cndmask_b32_sdwa v11, v5, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v17, v16, vcc +; VI-NEXT: v_cndmask_b32_sdwa v9, v4, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 ; VI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 @@ -1973,14 +1934,10 @@ define <8 x half> @v_vselect_v8f16(<8 x half> %a, <8 x half> %b, <8 x i32> %cond ; VI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v11 -; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 -; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_vselect_v8f16: @@ -2314,72 +2271,48 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> ; VI-LABEL: v_vselect_v16f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 -; VI-NEXT: v_cmp_eq_u32_e64 s[18:19], 0, v17 -; VI-NEXT: v_cmp_eq_u32_e64 s[40:41], 0, v29 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; VI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v18 -; VI-NEXT: v_cmp_eq_u32_e64 s[28:29], 0, v27 -; VI-NEXT: v_cndmask_b32_e64 v16, v17, v16, s[40:41] -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; VI-NEXT: v_cmp_eq_u32_e64 s[20:21], 0, v19 -; VI-NEXT: v_cmp_eq_u32_e64 s[26:27], 0, v25 -; VI-NEXT: v_cndmask_b32_e64 v17, v18, v17, s[28:29] -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; VI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v20 -; VI-NEXT: v_cmp_eq_u32_e64 s[24:25], 0, v23 -; VI-NEXT: v_cndmask_b32_e64 v18, v19, v18, s[26:27] -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; VI-NEXT: v_cmp_eq_u32_e64 s[22:23], 0, v21 -; VI-NEXT: v_cndmask_b32_e64 v19, v20, v19, s[24:25] -; VI-NEXT: 
v_lshrrev_b32_e32 v20, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; VI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v22 -; VI-NEXT: v_cndmask_b32_e64 v20, v21, v20, s[22:23] -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; VI-NEXT: v_cndmask_b32_e64 v21, v22, v21, s[20:21] -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; VI-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; VI-NEXT: v_cndmask_b32_e64 v22, v23, v22, s[18:19] -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; VI-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v22 -; VI-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[8:9] -; VI-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v15 -; VI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v24 -; VI-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[10:11] -; VI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v26 -; VI-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[12:13] -; VI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v28 -; VI-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[14:15] -; VI-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[16:17] +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30 +; VI-NEXT: v_cndmask_b32_e64 v30, v15, v7, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v28 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 -; VI-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 -; VI-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v19 -; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 -; VI-NEXT: v_or_b32_sdwa v4, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; VI-NEXT: v_or_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_or_b32_sdwa v6, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 +; VI-NEXT: v_cndmask_b32_sdwa v7, v15, v7, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v29 +; VI-NEXT: v_cndmask_b32_e64 v15, v14, v6, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v6, v14, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 +; VI-NEXT: v_cndmask_b32_e32 v14, v13, v5, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v12, v4, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v11, v3, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v10, v2, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v9, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc +; VI-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v27 +; VI-NEXT: v_cndmask_b32_sdwa v5, v13, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 +; VI-NEXT: v_cndmask_b32_sdwa v4, v12, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 +; VI-NEXT: v_cndmask_b32_sdwa v3, v11, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21 +; VI-NEXT: v_cndmask_b32_sdwa v2, v10, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; VI-NEXT: v_cndmask_b32_sdwa v1, v9, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; VI-NEXT: v_cndmask_b32_sdwa v0, v8, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v24, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v22, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_vselect_v16f16: @@ -3135,223 +3068,144 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; VI-LABEL: v_vselect_v32f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; VI-NEXT: 
buffer_load_dword v39, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v36 -; VI-NEXT: v_cndmask_b32_e32 v36, v43, v38, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v35 -; VI-NEXT: v_cndmask_b32_e32 v35, v45, v44, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34 -; VI-NEXT: v_cndmask_b32_e32 v34, v47, v46, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v33 -; VI-NEXT: v_cndmask_b32_e32 v33, v57, v56, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 -; VI-NEXT: v_cndmask_b32_e32 v32, v59, v58, vcc -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v24 -; VI-NEXT: v_cndmask_b32_e32 v38, v38, v60, vcc +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v32 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_cndmask_b32_e64 v31, v33, v15, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v15, v33, v15, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v39 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 -; VI-NEXT: v_cndmask_b32_e32 v39, v44, v43, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v35 +; VI-NEXT: v_cndmask_b32_e64 v32, v30, v14, 
s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v14, v30, v14, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v36 +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 +; VI-NEXT: v_cndmask_b32_e64 v33, v29, v13, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v37 +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 +; VI-NEXT: v_cndmask_b32_e64 v34, v28, v12, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v38 +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v48 +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 +; VI-NEXT: v_cndmask_b32_sdwa v13, v29, v13, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v49 +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 +; VI-NEXT: v_cndmask_b32_sdwa v12, v28, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 -; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v55 -; VI-NEXT: v_cndmask_b32_e32 v31, v31, v45, vcc -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v23 -; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v50 -; VI-NEXT: v_cndmask_b32_e32 v50, v43, v55, vcc -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v22 -; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v53 -; VI-NEXT: v_cndmask_b32_e32 v53, v43, v55, vcc -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v54 -; VI-NEXT: v_cndmask_b32_e32 v54, v43, v55, vcc -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v52 -; VI-NEXT: v_cndmask_b32_e32 v52, v43, v55, vcc -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 +; VI-NEXT: v_cndmask_b32_e64 v35, v27, v11, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v11, v27, v11, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v51 -; VI-NEXT: v_cndmask_b32_e32 v51, v43, v55, vcc -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v49 -; VI-NEXT: v_cndmask_b32_e32 v49, v43, v55, vcc -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 -; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v48 -; VI-NEXT: v_cndmask_b32_e32 v48, v46, v43, vcc -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 -; 
VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 -; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v40 -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 -; VI-NEXT: v_cndmask_b32_e32 v46, v58, v46, vcc -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:8 +; VI-NEXT: v_cndmask_b32_sdwa v27, v26, v10, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v41 -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 -; VI-NEXT: v_cndmask_b32_e32 v15, v37, v15, vcc -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v42 -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 -; VI-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v52 +; VI-NEXT: v_cndmask_b32_sdwa v28, v25, v9, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v53 +; VI-NEXT: v_cndmask_b32_sdwa v29, v24, v8, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v54 +; VI-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v39 +; VI-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[4:5] +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 +; VI-NEXT: v_or_b32_sdwa v9, v9, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v35, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v34, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v33, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v31, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 +; VI-NEXT: v_cndmask_b32_sdwa v30, v23, v7, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v44 -; VI-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v36 +; VI-NEXT: v_cndmask_b32_sdwa v36, v22, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v55 -; VI-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v37 +; VI-NEXT: v_cndmask_b32_sdwa v37, v21, v5, vcc dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v45 -; VI-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v38 +; VI-NEXT: v_cndmask_b32_sdwa v38, v20, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v47 -; VI-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v48 +; VI-NEXT: v_cndmask_b32_sdwa v48, v19, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v56 -; VI-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v49 +; VI-NEXT: v_cndmask_b32_sdwa v49, v18, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v43 -; VI-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v50 +; VI-NEXT: v_cndmask_b32_sdwa v50, v17, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v57 -; VI-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v59 -; VI-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v40 -; VI-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v58 -; VI-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v41 -; VI-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v37 -; VI-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v42 -; VI-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v51 +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; VI-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v46 -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; VI-NEXT: 
v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v49 -; VI-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v51 -; VI-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v52 -; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v54 -; VI-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 -; VI-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 -; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v39 -; VI-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v38 -; VI-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v32 -; VI-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v33 -; VI-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v34 -; VI-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v35 -; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v36 -; VI-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v31 -; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v51 +; VI-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 +; VI-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[4:5] +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:4 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v39 +; VI-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v25 +; VI-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v52 +; VI-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v53 +; VI-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v54 +; VI-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v55 +; VI-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v17, v16, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa 
v5, v5, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 +; VI-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; VI-NEXT: v_or_b32_sdwa v0, v0, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_vselect_v32f16: diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll index 40d80f5e83e36..6972aa3a0529b 100644 --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -184,17 +184,17 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8-NEXT: v_sub_u16_e32 v4, v3, v2 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v3 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v2 +; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v4 +; GFX8-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[4:5] ; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX8-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v1 ; GFX8-NEXT: v_sub_u16_e32 v1, v0, v1 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1 ; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -247,10 +247,11 @@ define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { ; GFX8-NEXT: v_sub_u16_e32 v6, v5, v4 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v4 +; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 +; GFX8-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[4:5] ; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8-NEXT: v_cndmask_b32_sdwa v4, v6, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v3 ; GFX8-NEXT: v_sub_u16_e32 v3, v1, v3 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 @@ -265,8 +266,7 @@ define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { ; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v3i16: @@ -324,17 +324,17 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8-NEXT: v_sub_u16_e32 v6, v5, v4 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v4 +; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 +; 
GFX8-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[4:5] ; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8-NEXT: v_cndmask_b32_sdwa v4, v6, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v2 ; GFX8-NEXT: v_sub_u16_e32 v2, v0, v2 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2 ; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 @@ -342,17 +342,17 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8-NEXT: v_sub_u16_e32 v5, v4, v2 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v2 +; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v5 +; GFX8-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[4:5] ; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX8-NEXT: v_cndmask_b32_sdwa v2, v5, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v3 ; GFX8-NEXT: v_sub_u16_e32 v3, v1, v3 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 ; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3 ; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] From 16e411802931b60cc02a48f43ef547b5969fc4fe Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Wed, 30 Apr 2025 05:36:35 -0400 Subject: [PATCH 03/26] Change computeRegisterLiveness use - Don' use auto - Adapt other use for consistency - Use default threshold - Adjust tests --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 6 +- llvm/test/CodeGen/AMDGPU/bf16.ll | 1007 ++++++++++------- .../CodeGen/AMDGPU/extract_vector_elt-f16.ll | 26 +- llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll | 54 +- llvm/test/CodeGen/AMDGPU/fmaximum3.ll | 229 ++-- llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll | 54 +- llvm/test/CodeGen/AMDGPU/fminimum3.ll | 229 ++-- .../CodeGen/AMDGPU/insert_vector_dynelt.ll | 30 +- .../AMDGPU/insert_vector_elt.v2bf16.ll | 203 ++-- .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 203 ++-- llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 284 +++-- llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 284 +++-- llvm/test/CodeGen/AMDGPU/maximumnum.ll | 6 +- llvm/test/CodeGen/AMDGPU/minimumnum.ll | 6 +- llvm/test/CodeGen/AMDGPU/saddsat.ll | 26 +- .../AMDGPU/select-fabs-fneg-extract.v2f16.ll | 12 +- llvm/test/CodeGen/AMDGPU/select.f16.ll | 471 +++++--- llvm/test/CodeGen/AMDGPU/ssubsat.ll | 26 +- 18 files changed, 1834 insertions(+), 1322 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index f5f808623cc0c..7ad18356864a0 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1038,7 +1038,8 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI, return; // Make sure VCC or its subregs are dead before MI. 
MachineBasicBlock &MBB = *MI.getParent(); - auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25); + MachineBasicBlock::LivenessQueryResult Liveness = + MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25); if (Liveness != MachineBasicBlock::LQR_Dead) return; // Check if VCC is referenced in range of (MI,MISucc]. @@ -1100,7 +1101,8 @@ void SIPeepholeSDWA::convertToImplicitVcc(MachineInstr &MI, // Make sure VCC or its subregs are dead before MI. MachineBasicBlock &MBB = *MI.getParent(); - auto Liveness = MBB.computeRegisterLiveness(TRI, Vcc, MI, 100); + MachineBasicBlock::LivenessQueryResult Liveness = + MBB.computeRegisterLiveness(TRI, Vcc, MI); if (Liveness != MachineBasicBlock::LQR_Dead) { LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction.\n"); return; diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index e172bf090cca7..bc8665ffb9b63 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -40558,10 +40558,11 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX8-NEXT: v_cndmask_b32_sdwa v3, v4, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_mov_b32_e32 v4, s3 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_lshr_b32 s3, s2, 16 @@ -40569,10 +40570,11 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX8-NEXT: v_cndmask_b32_sdwa v1, v3, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 @@ -40758,18 +40760,24 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_cndmask_b32_sdwa v3, v7, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX8-NEXT: v_cndmask_b32_sdwa v1, v6, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_vselect_v4bf16: @@ -40975,32 +40983,44 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v7, 1, v7 ; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v15 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GFX8-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX8-NEXT: v_cndmask_b32_sdwa v7, v15, v11, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v14 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX8-NEXT: v_cndmask_b32_sdwa v5, v14, v10, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v15, v11, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v13 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_cndmask_b32_sdwa v3, v13, v9, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v12 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX8-NEXT: v_cndmask_b32_sdwa v1, v12, v8, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v9, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_vselect_v8bf16: @@ -41011,25 +41031,33 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 
; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX9-NEXT: v_cndmask_b32_sdwa v7, v15, v11, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v14 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_cndmask_b32_sdwa v5, v14, v10, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v13 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v3, v13, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v12 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v12, v8, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 @@ -41425,128 +41453,168 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX8-LABEL: v_vselect_v16bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 -; GFX8-NEXT: buffer_load_dword v15, off, s[0:3], s32 -; GFX8-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX8-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX8-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX8-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX8-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_sdwa v14, v15, v23, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 -; GFX8-NEXT: v_cndmask_b32_sdwa v13, v30, v22, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v23, s[4:5] -; GFX8-NEXT: v_and_b32_e32 v23, 1, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v29, v21, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v27, v19, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v12, v30, v22, s[4:5] -; GFX8-NEXT: 
v_cndmask_b32_e32 v30, v26, v18, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v31, v25, v17, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 -; GFX8-NEXT: v_and_b32_e32 v22, 1, v7 -; GFX8-NEXT: v_cndmask_b32_sdwa v2, v29, v21, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 -; GFX8-NEXT: v_cndmask_b32_sdwa v4, v28, v20, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v22 -; GFX8-NEXT: v_or_b32_sdwa v5, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_sdwa v2, v27, v19, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v23 -; GFX8-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_sdwa v8, v26, v18, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX8-NEXT: v_cndmask_b32_sdwa v9, v25, v17, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX8-NEXT: v_or_b32_sdwa v3, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v30, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_sdwa v8, v24, v16, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_sdwa v7, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v6, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v31, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v0, 1, v1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v3 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v4 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v5 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v6 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v7 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v8 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v9 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v10 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v11 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v12 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v13 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v14 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v15 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v0, s[28:29] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; 
GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v29 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v1, v0, s[24:25] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, v0, s[20:21] +; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v24 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v30, v22, s[26:27] +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v8, v29, v21, s[22:23] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v28, v20, s[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v12, v27, v19, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v13, v26, v18, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v14, v25, v17, s[6:7] +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e64 v10, v0, v23, s[40:41] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v0, v1, s[42:43] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v0, s[16:17] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v26 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[12:13] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v15, v0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v15, v24, v16, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; GFX8-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v10, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_vselect_v16bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 -; GFX9-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14 +; GFX9-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 ; GFX9-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 ; GFX9-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v22, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GFX9-NEXT: v_and_b32_e32 v10, 1, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v29, v21, vcc +; 
GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v29 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v22, v21, vcc +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v28, v20, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v28 ; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX9-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX9-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v27 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v22, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; GFX9-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX9-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v27, v6, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX9-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v23 ; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v14, v15, v23, s[4:5] -; GFX9-NEXT: v_cndmask_b32_sdwa v15, v15, v23, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 -; GFX9-NEXT: v_cndmask_b32_sdwa v13, v30, v22, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 -; GFX9-NEXT: v_and_b32_e32 v23, 1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v29, v21, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v27, v19, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v12, v30, v22, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v30, v26, v18, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v14, v21, v23, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v21 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v25, v17, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v15, v6, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 -; GFX9-NEXT: v_and_b32_e32 v22, 1, v7 -; GFX9-NEXT: v_cndmask_b32_sdwa v2, v29, v21, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; 
GFX9-NEXT: v_cndmask_b32_sdwa v4, v28, v20, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v22 -; GFX9-NEXT: v_perm_b32 v5, v2, v5, s4 -; GFX9-NEXT: v_cndmask_b32_sdwa v2, v27, v19, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v23 -; GFX9-NEXT: v_perm_b32 v4, v4, v8, s4 -; GFX9-NEXT: v_cndmask_b32_sdwa v8, v26, v18, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-NEXT: v_cndmask_b32_sdwa v9, v25, v17, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_perm_b32 v3, v2, v10, s4 -; GFX9-NEXT: v_perm_b32 v2, v8, v30, s4 -; GFX9-NEXT: v_cndmask_b32_sdwa v8, v24, v16, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_perm_b32 v7, v15, v14, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v15, v6, vcc +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 +; GFX9-NEXT: v_perm_b32 v2, v5, v4, s4 +; GFX9-NEXT: v_perm_b32 v3, v9, v19, s4 +; GFX9-NEXT: v_perm_b32 v4, v8, v20, s4 +; GFX9-NEXT: v_perm_b32 v5, v10, v11, s4 ; GFX9-NEXT: v_perm_b32 v6, v13, v12, s4 -; GFX9-NEXT: v_perm_b32 v1, v9, v31, s4 -; GFX9-NEXT: v_perm_b32 v0, v8, v0, s4 +; GFX9-NEXT: v_perm_b32 v7, v7, v14, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v16bf16: @@ -42426,336 +42494,425 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-LABEL: v_vselect_v32bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v26, 1, v26 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v26 -; GFX8-NEXT: buffer_load_ushort v26, off, s[0:3], s32 -; GFX8-NEXT: v_and_b32_e32 v24, 1, v24 -; GFX8-NEXT: v_and_b32_e32 v22, 1, v22 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v24 -; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 -; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v22 -; GFX8-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 -; GFX8-NEXT: v_and_b32_e32 v30, 1, v30 -; GFX8-NEXT: v_and_b32_e32 v28, 1, v28 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v30 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v28 -; GFX8-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124 -; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:60 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v18 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v18 -; GFX8-NEXT: v_and_b32_e32 v20, 1, v20 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v20 -; GFX8-NEXT: v_and_b32_e32 v25, 1, v25 -; GFX8-NEXT: v_and_b32_e32 v23, 1, v23 -; GFX8-NEXT: v_and_b32_e32 v21, 1, v21 -; GFX8-NEXT: v_and_b32_e32 v19, 1, v19 -; GFX8-NEXT: v_and_b32_e32 v17, 1, v17 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v16 -; GFX8-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX8-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX8-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX8-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX8-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX8-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX8-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX8-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX8-NEXT: v_and_b32_e32 
v2, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_and_b32_e32 v18, 1, v26 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 -; GFX8-NEXT: v_and_b32_e32 v26, 1, v27 -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_cndmask_b32_e64 v18, v24, v22, s[8:9] -; GFX8-NEXT: v_cndmask_b32_sdwa v20, v24, v22, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_and_b32_e32 v22, 1, v29 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v22 -; GFX8-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_cndmask_b32_e64 v22, v28, v30, s[6:7] -; GFX8-NEXT: v_cndmask_b32_sdwa v24, v28, v30, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v26 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e64 v26, v31, v28, s[4:5] -; GFX8-NEXT: v_cndmask_b32_sdwa v27, v31, v28, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v25 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e64 v25, v28, v29, s[16:17] -; GFX8-NEXT: v_cndmask_b32_sdwa v28, v28, v29, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:48 -; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v23 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e64 v23, v30, v29, s[12:13] -; GFX8-NEXT: v_cndmask_b32_sdwa v29, v30, v29, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:108 -; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v21 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e64 v21, v30, v31, s[10:11] -; GFX8-NEXT: v_cndmask_b32_sdwa v30, v30, v31, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 -; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e64 v19, v31, v32, s[14:15] -; GFX8-NEXT: v_cndmask_b32_sdwa v31, v31, v32, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 -; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 -; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_sdwa v16, v17, v32, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_e64 v17, v17, v32, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 -; GFX8-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_sdwa v14, v32, v15, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; 
GFX8-NEXT: v_cndmask_b32_e64 v15, v32, v15, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 -; GFX8-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 -; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v12 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_sdwa v12, v13, v32, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v32, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 -; GFX8-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 -; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_sdwa v10, v11, v32, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v32, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 -; GFX8-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 -; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_sdwa v32, v8, v9, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v6 -; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 -; GFX8-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_sdwa v9, v6, v7, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v4 -; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 -; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v3 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v4 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v5 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v6 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v7 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v8 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v9 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v10 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v11 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v12 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v13 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v14 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v15 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v16 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v17 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v18 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, 
v19 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v20 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v21 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v22 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v23 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v24 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v25 +; GFX8-NEXT: v_writelane_b32 v34, s30, 0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v26 +; GFX8-NEXT: v_writelane_b32 v34, s31, 1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v27 +; GFX8-NEXT: v_writelane_b32 v34, s34, 2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v28 +; GFX8-NEXT: v_writelane_b32 v34, s35, 3 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v29 +; GFX8-NEXT: v_writelane_b32 v34, s36, 4 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v30 +; GFX8-NEXT: v_writelane_b32 v34, s37, 5 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0 +; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX8-NEXT: v_writelane_b32 v34, s38, 6 +; GFX8-NEXT: v_writelane_b32 v34, s39, 7 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_sdwa v7, v4, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0 +; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 +; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 +; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 +; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 +; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 +; GFX8-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 +; GFX8-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 +; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 +; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 +; GFX8-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 +; GFX8-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 +; GFX8-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 +; GFX8-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 +; GFX8-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 +; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 +; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 +; GFX8-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 +; GFX8-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 +; GFX8-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 +; GFX8-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 +; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48 +; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 +; GFX8-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 +; GFX8-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 +; GFX8-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 +; 
GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; GFX8-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:128 +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v29 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_sdwa v5, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 -; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v32 +; GFX8-NEXT: v_cndmask_b32_e64 v28, v33, v28, s[38:39] +; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[36:37] +; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v31 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX8-NEXT: v_cndmask_b32_e64 v32, v33, v32, s[34:35] +; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, v31, s[30:31] +; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX8-NEXT: v_cndmask_b32_e64 v31, v33, v31, s[90:91] +; GFX8-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[88:89] +; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX8-NEXT: v_cndmask_b32_e64 v27, v33, v27, s[78:79] +; GFX8-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[76:77] +; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX8-NEXT: v_cndmask_b32_e64 v25, v33, v25, s[74:75] +; GFX8-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[72:73] +; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX8-NEXT: v_cndmask_b32_e64 v23, v33, v23, s[62:63] +; GFX8-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[60:61] +; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX8-NEXT: v_cndmask_b32_e64 v21, v33, v21, s[58:59] +; GFX8-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[56:57] +; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v19, v33, v19, s[46:47] +; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[44:45] +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v17, v33, v17, s[42:43] +; GFX8-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[40:41] +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX8-NEXT: v_cndmask_b32_e64 v15, v33, v15, s[28:29] +; GFX8-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27] +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX8-NEXT: v_cndmask_b32_e64 v13, v33, v13, s[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23] +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v33, v11, s[20:21] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19] +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v9, v33, v9, s[16:17] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15] +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v33, v7, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; 
GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v33, v5, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v33, v3, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v17 +; GFX8-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v14, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v27 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v28 +; GFX8-NEXT: v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v11, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v12, v24, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v14, v30, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readlane_b32 s39, v34, 7 +; GFX8-NEXT: v_readlane_b32 s38, v34, 6 +; GFX8-NEXT: v_readlane_b32 s37, v34, 5 +; GFX8-NEXT: v_readlane_b32 s36, v34, 4 +; GFX8-NEXT: v_readlane_b32 s35, v34, 3 +; GFX8-NEXT: v_readlane_b32 s34, v34, 2 +; GFX8-NEXT: v_readlane_b32 s31, v34, 1 +; GFX8-NEXT: v_readlane_b32 s30, v34, 0 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_sdwa v3, v0, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v4, v8, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v5, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v6, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v7, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v8, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v9, v19, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v10, v21, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v11, v23, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v12, v25, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v13, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v14, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v15, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_vselect_v32bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v26, 1, v26 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v26 -; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 -; GFX9-NEXT: v_and_b32_e32 v24, 1, v24 -; GFX9-NEXT: v_and_b32_e32 v22, 1, v22 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v24 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v22 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 -; GFX9-NEXT: v_and_b32_e32 v30, 1, v30 -; GFX9-NEXT: v_and_b32_e32 v28, 1, v28 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v30 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v28 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:60 -; GFX9-NEXT: v_and_b32_e32 v18, 1, v18 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v18 -; GFX9-NEXT: v_and_b32_e32 v20, 1, v20 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v20 -; GFX9-NEXT: v_and_b32_e32 v25, 1, v25 -; GFX9-NEXT: v_and_b32_e32 v23, 1, v23 -; GFX9-NEXT: v_and_b32_e32 v21, 1, v21 -; GFX9-NEXT: v_and_b32_e32 v19, 1, v19 -; GFX9-NEXT: v_and_b32_e32 v17, 1, v17 -; GFX9-NEXT: v_and_b32_e32 v16, 1, v16 -; GFX9-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX9-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX9-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX9-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX9-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX9-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX9-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX9-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; 
GFX9-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_and_b32_e32 v18, 1, v26 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 -; GFX9-NEXT: v_and_b32_e32 v26, 1, v27 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_cndmask_b32_e64 v18, v24, v22, s[8:9] -; GFX9-NEXT: v_cndmask_b32_sdwa v20, v24, v22, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_e32 v22, 1, v29 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v22 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e64 v22, v28, v30, s[6:7] -; GFX9-NEXT: v_cndmask_b32_sdwa v24, v28, v30, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v26 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v26, v31, v28, s[4:5] -; GFX9-NEXT: v_cndmask_b32_sdwa v27, v31, v28, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v25 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v25, v28, v29, s[10:11] -; GFX9-NEXT: v_cndmask_b32_sdwa v28, v28, v29, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v23 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v23, v30, v29, s[12:13] -; GFX9-NEXT: v_cndmask_b32_sdwa v29, v30, v29, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v21 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v21, v30, v31, s[14:15] -; GFX9-NEXT: v_cndmask_b32_sdwa v30, v30, v31, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v19, v31, v32, s[16:17] -; GFX9-NEXT: v_cndmask_b32_sdwa v31, v31, v32, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v16, v17, v32, s[4:5] -; GFX9-NEXT: v_cndmask_b32_sdwa v17, v17, v32, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v14, v32, v15, s[4:5] -; GFX9-NEXT: v_cndmask_b32_sdwa v15, v32, v15, vcc dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v12 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v12, v13, v32, s[4:5] -; GFX9-NEXT: v_cndmask_b32_sdwa v13, v13, v32, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v10, v11, v32, s[4:5] -; GFX9-NEXT: v_cndmask_b32_sdwa v11, v11, v32, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v32, v8, v9, s[4:5] -; GFX9-NEXT: v_cndmask_b32_sdwa v8, v8, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v6 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_sdwa v6, v6, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v4 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v7, v4, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[4:5] -; GFX9-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v5 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v4 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v7 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v6 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v9 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v11 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v10 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v13 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, 
v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v12 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v15 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v14 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v17 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v16 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v19 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v18 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v21 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v20 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v23 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v22 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v25 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v24 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v27 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v26 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v29 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[92:93], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v28 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[94:95], 1, v0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: v_writelane_b32 v33, s30, 0 +; GFX9-NEXT: v_writelane_b32 v33, s31, 1 +; GFX9-NEXT: v_writelane_b32 v33, s34, 2 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_writelane_b32 v33, s35, 3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v30 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 
offset:116 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e64 v29, v31, v32, s[34:35] +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[30:31] +; GFX9-NEXT: v_cndmask_b32_e64 v32, v28, v30, s[94:95] +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, v30, s[92:93] +; GFX9-NEXT: v_cndmask_b32_e64 v30, v26, v27, s[90:91] +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[88:89] +; GFX9-NEXT: v_cndmask_b32_e64 v27, v24, v25, s[78:79] +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX9-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[76:77] +; GFX9-NEXT: v_cndmask_b32_e64 v25, v22, v23, s[74:75] +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[72:73] +; GFX9-NEXT: v_cndmask_b32_e64 v23, v20, v21, s[62:63] +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[60:61] +; GFX9-NEXT: v_cndmask_b32_e64 v21, v18, v19, s[58:59] +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX9-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[56:57] +; GFX9-NEXT: v_cndmask_b32_e64 v19, v16, v17, s[46:47] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[44:45] +; GFX9-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[42:43] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[40:41] +; GFX9-NEXT: v_cndmask_b32_e64 v15, v12, v13, s[28:29] +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27] +; GFX9-NEXT: v_cndmask_b32_e64 v13, v10, v11, s[24:25] +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v8, v9, s[20:21] +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[16:17] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v4, v5, s[12:13] +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5] -; GFX9-NEXT: 
v_cndmask_b32_sdwa v0, v0, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: v_perm_b32 v1, v2, v5, s4 ; GFX9-NEXT: v_perm_b32 v2, v4, v7, s4 ; GFX9-NEXT: v_perm_b32 v3, v6, v9, s4 -; GFX9-NEXT: v_perm_b32 v4, v8, v32, s4 -; GFX9-NEXT: v_perm_b32 v5, v11, v10, s4 -; GFX9-NEXT: v_perm_b32 v6, v13, v12, s4 -; GFX9-NEXT: v_perm_b32 v7, v15, v14, s4 -; GFX9-NEXT: v_perm_b32 v8, v17, v16, s4 -; GFX9-NEXT: v_perm_b32 v9, v31, v19, s4 -; GFX9-NEXT: v_perm_b32 v10, v30, v21, s4 -; GFX9-NEXT: v_perm_b32 v11, v29, v23, s4 -; GFX9-NEXT: v_perm_b32 v12, v28, v25, s4 -; GFX9-NEXT: v_perm_b32 v13, v27, v26, s4 -; GFX9-NEXT: v_perm_b32 v14, v24, v22, s4 -; GFX9-NEXT: v_perm_b32 v15, v20, v18, s4 +; GFX9-NEXT: v_perm_b32 v4, v8, v11, s4 +; GFX9-NEXT: v_perm_b32 v5, v10, v13, s4 +; GFX9-NEXT: v_perm_b32 v6, v12, v15, s4 +; GFX9-NEXT: v_perm_b32 v7, v14, v17, s4 +; GFX9-NEXT: v_perm_b32 v8, v16, v19, s4 +; GFX9-NEXT: v_perm_b32 v9, v18, v21, s4 +; GFX9-NEXT: v_perm_b32 v10, v20, v23, s4 +; GFX9-NEXT: v_perm_b32 v11, v22, v25, s4 +; GFX9-NEXT: v_perm_b32 v12, v24, v27, s4 +; GFX9-NEXT: v_perm_b32 v13, v26, v30, s4 +; GFX9-NEXT: v_perm_b32 v14, v28, v32, s4 +; GFX9-NEXT: v_perm_b32 v15, v31, v29, s4 +; GFX9-NEXT: v_readlane_b32 s35, v33, 3 +; GFX9-NEXT: v_readlane_b32 s34, v33, 2 +; GFX9-NEXT: v_readlane_b32 s31, v33, 1 +; GFX9-NEXT: v_readlane_b32 s30, v33, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v32bf16: diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index 5488be398fe48..ea2c82f2679f3 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -832,35 +832,35 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; VI-NEXT: flat_load_dwordx4 v[5:8], v[5:6] ; VI-NEXT: v_mov_b32_e32 v10, s1 -; VI-NEXT: s_cmp_eq_u32 s4, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, s0, v0 +; VI-NEXT: s_cmp_eq_u32 s4, 1 ; VI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 2 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cndmask_b32_sdwa v0, v1, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 3 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 4 -; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 5 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 ; VI-NEXT: 
v_cndmask_b32_e32 v0, v0, v3, vcc -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 6 -; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 7 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 8 -; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 9 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll index b4983bbdf4afa..16e217008ace5 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll @@ -344,16 +344,22 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 { ; VI-SAFE-LABEL: test_fmax_legacy_ugt_v4f16: ; VI-SAFE: ; %bb.0: ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_cmp_nle_f16 vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v4, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-SAFE-NEXT: v_cmp_nle_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v5, v2, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v7, v6 +; VI-SAFE-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 +; VI-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-NNAN-LABEL: test_fmax_legacy_ugt_v4f16: @@ -511,14 +517,22 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; VI-SAFE-LABEL: test_fmax_legacy_ugt_v8f16: ; VI-SAFE: ; %bb.0: ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_cmp_nle_f16 vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v8, v7, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-SAFE-NEXT: v_cmp_nle_f16 vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v9, v6, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-SAFE-NEXT: 
v_cmp_nle_f16 vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v10, v5, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-SAFE-NEXT: v_cmp_nle_f16 vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v11, v4, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v15, v14 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; VI-SAFE-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc +; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v13, v12 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc +; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v11, v10 +; VI-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v9, v8 +; VI-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v3, v7 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v2, v6 @@ -527,10 +541,14 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v4 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; VI-SAFE-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v14 +; VI-SAFE-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-NNAN-LABEL: test_fmax_legacy_ugt_v8f16: diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index 753b74ba696d1..05d45d2b2bfac 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -2080,11 +2080,12 @@ define <2 x half> @v_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c ; GFX942-NEXT: v_pk_max_f16 v3, v0, v1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 ; 
GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v2, v1 @@ -2126,11 +2127,12 @@ define <2 x half> @v_fmaximum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x ; GFX942-NEXT: v_pk_max_f16 v3, v0, v1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, v2 @@ -2174,9 +2176,10 @@ define <2 x half> @v_fmaximum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 ; GFX942-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v3, v3, v4 ; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 ; GFX942-NEXT: v_cndmask_b32_sdwa v6, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -2229,11 +2232,12 @@ define <2 x half> @v_fmaximum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 ; GFX942-NEXT: v_pk_max_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1] ; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -2375,16 +2379,18 @@ define <3 x half> @v_fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v5, v1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 @@ -2435,16 +2441,18 @@ define <3 x half> @v_fmaximum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, v5 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 @@ -2501,33 +2509,36 @@ define <3 x half> @v_fmaximum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3 ; GFX942-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v0 ; GFX942-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v2 ; GFX942-NEXT: v_pk_max_f16 v7, v7, v9 -; GFX942-NEXT: v_mov_b32_e32 v9, 0x7e00 +; GFX942-NEXT: v_mov_b32_e32 v12, 0x7e00 +; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v6, v6, v8 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: v_cndmask_b32_sdwa v12, v9, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4 ; GFX942-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5 -; GFX942-NEXT: v_cndmask_b32_sdwa v8, v9, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v9, v7, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v9, v6, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc ; GFX942-NEXT: v_perm_b32 v2, v8, v0, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v11 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v6, v12, v1, s0 +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX942-NEXT: v_perm_b32 v6, v9, v1, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc ; GFX942-NEXT: v_pk_max_f16 v6, v6, v10 -; GFX942-NEXT: v_cndmask_b32_sdwa v3, v9, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc 
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc ; GFX942-NEXT: v_perm_b32 v0, v3, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2575,16 +2586,18 @@ define <3 x half> @v_fmaximum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 @@ -2643,14 +2656,16 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) { ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX942-NEXT: v_pack_b32_f16 v7, v1, s0 -; GFX942-NEXT: v_pk_max_f16 v7, v7, v3 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX942-NEXT: v_perm_b32 v4, v6, v0, s1 ; GFX942-NEXT: v_pk_max_f16 v4, v4, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v7, vcc +; GFX942-NEXT: v_pack_b32_f16 v7, v1, s0 +; GFX942-NEXT: v_pk_max_f16 v7, v7, v3 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc @@ -2695,16 +2710,18 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) { ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX942-NEXT: v_perm_b32 v1, v1, v4, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, 4.0 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 @@ -2752,37 +2769,38 @@ define <4 x half> @v_fmaximum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c ; GFX942-NEXT: v_pk_max_f16 v6, v0, v2 
; GFX942-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_mov_b32 s2, 0x5040100 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 +; GFX942-NEXT: v_perm_b32 v2, v1, v6, s2 ; GFX942-NEXT: v_pk_max_f16 v2, v5, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v5, v1 src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s2 ; GFX942-NEXT: v_pk_max_f16 v2, v4, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 -; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX942-NEXT: v_perm_b32 v1, v1, v3, s2 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 +; GFX942-NEXT: v_perm_b32 v0, v0, v5, s2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v4f16: @@ -2817,37 +2835,38 @@ define <4 x half> @v_fmaximum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x ; GFX942-NEXT: v_pk_max_f16 v6, v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_mov_b32 s2, 0x5040100 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_eq_u64_e64 
vcc, 1, s[0:1] ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 +; GFX942-NEXT: v_perm_b32 v2, v1, v6, s2 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v5 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v1, v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s2 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 -; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX942-NEXT: v_perm_b32 v1, v1, v3, s2 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 +; GFX942-NEXT: v_perm_b32 v0, v0, v5, s2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v4f16_commute: @@ -2891,38 +2910,41 @@ define <4 x half> @v_fmaximum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 ; GFX942-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1 ; GFX942-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3 ; GFX942-NEXT: v_pk_max_f16 v7, v7, v9 -; GFX942-NEXT: v_mov_b32_e32 v9, 0x7e00 +; GFX942-NEXT: v_mov_b32_e32 v12, 0x7e00 +; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v6, v6, v8 -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: v_cndmask_b32_sdwa v12, v9, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_mov_b32 s2, 0x5040100 +; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5 ; GFX942-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4 -; GFX942-NEXT: v_cndmask_b32_sdwa v8, v9, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v9, v7, vcc +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v8, |v5| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc -; GFX942-NEXT: v_perm_b32 v2, v8, v1, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc +; GFX942-NEXT: v_perm_b32 v2, v8, v1, s2 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v11 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v6, v12, v0, s0 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: v_perm_b32 v6, v9, v0, s2 ; GFX942-NEXT: v_pk_max_f16 v6, v6, v10 -; GFX942-NEXT: v_cndmask_b32_sdwa v3, v9, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: 
v_cmp_o_f16_sdwa vcc, v12, |v4| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_sdwa v3, v12, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v7, v9, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_sdwa v7, v12, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| -; GFX942-NEXT: v_perm_b32 v1, v3, v1, s0 +; GFX942-NEXT: v_perm_b32 v1, v3, v1, s2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v9, v6, vcc -; GFX942-NEXT: v_perm_b32 v0, v7, v0, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc +; GFX942-NEXT: v_perm_b32 v0, v7, v0, s2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v4f16__fabs_all: @@ -2966,37 +2988,38 @@ define <4 x half> @v_fmaximum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 ; GFX942-NEXT: v_pk_max_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] ; GFX942-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_mov_b32 s2, 0x5040100 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 +; GFX942-NEXT: v_perm_b32 v2, v1, v6, s2 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v1, -v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s2 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 -; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX942-NEXT: v_perm_b32 v1, v1, v3, s2 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: 
v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 +; GFX942-NEXT: v_perm_b32 v0, v0, v5, s2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v4f16__fneg_all: @@ -3035,21 +3058,23 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v7, v1, 2.0 op_sel_hi:[1,0] -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_mov_b32 s2, 0x5040100 ; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v7 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v8, v5, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v5, v8, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v8, v3 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc -; GFX942-NEXT: v_perm_b32 v4, v8, v1, s0 +; GFX942-NEXT: v_perm_b32 v4, v8, v1, s2 ; GFX942-NEXT: v_pk_max_f16 v4, v4, v3 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v8, v6, v0, s0 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: v_perm_b32 v8, v6, v0, s2 ; GFX942-NEXT: v_pk_max_f16 v8, v8, v2 ; GFX942-NEXT: v_cndmask_b32_sdwa v7, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1 @@ -3059,10 +3084,10 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX942-NEXT: v_perm_b32 v1, v7, v1, s0 +; GFX942-NEXT: v_perm_b32 v1, v7, v1, s2 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc -; GFX942-NEXT: v_perm_b32 v0, v6, v0, s0 +; GFX942-NEXT: v_perm_b32 v0, v6, v0, s2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v4f16__inlineimm1: @@ -3097,18 +3122,19 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) { ; GFX942-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; 
GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v1, v4, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] @@ -3947,11 +3973,12 @@ define <4 x half> @v_no_fmaximum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b, ; GFX942-NEXT: v_pk_max_f16 v3, v0, v1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v1, v5, s0 ; GFX942-NEXT: v_pk_max_f16 v3, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll index 1be0590bcaae0..144074e114045 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll @@ -345,16 +345,22 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 { ; VI-SAFE-LABEL: test_fmin_legacy_ule_v4f16: ; VI-SAFE: ; %bb.0: ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_cmp_ngt_f16 vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v4, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-SAFE-NEXT: v_cmp_ngt_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v5, v2, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v7, v6 +; VI-SAFE-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 +; VI-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-NNAN-LABEL: test_fmin_legacy_ule_v4f16: @@ -512,14 +518,22 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; VI-SAFE-LABEL: test_fmin_legacy_ule_v8f16: ; VI-SAFE: ; %bb.0: ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_cmp_ngt_f16 vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 -; VI-SAFE-NEXT: 
v_cndmask_b32_sdwa v8, v7, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-SAFE-NEXT: v_cmp_ngt_f16 vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v9, v6, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-SAFE-NEXT: v_cmp_ngt_f16 vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v10, v5, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-SAFE-NEXT: v_cmp_ngt_f16 vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v11, v4, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v15, v14 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; VI-SAFE-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc +; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v13, v12 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc +; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v11, v10 +; VI-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v9, v8 +; VI-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v7 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v2, v6 @@ -528,10 +542,14 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v4 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; VI-SAFE-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v14 +; VI-SAFE-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-NNAN-LABEL: test_fmin_legacy_ule_v8f16: diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll index ce6b522adff0a..44cc7428f888d 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -2080,11 +2080,12 @@ define <2 x half> @v_fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c ; GFX942-NEXT: v_pk_min_f16 v3, v0, v1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v0, v1 
src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v2, v1 @@ -2126,11 +2127,12 @@ define <2 x half> @v_fminimum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x ; GFX942-NEXT: v_pk_min_f16 v3, v0, v1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v1, v2 @@ -2174,9 +2176,10 @@ define <2 x half> @v_fminimum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 ; GFX942-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v3, v3, v4 ; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 ; GFX942-NEXT: v_cndmask_b32_sdwa v6, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -2229,11 +2232,12 @@ define <2 x half> @v_fminimum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 ; GFX942-NEXT: v_pk_min_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1] ; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -2375,16 +2379,18 @@ define <3 x half> @v_fminimum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; 
GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v5, v1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 @@ -2435,16 +2441,18 @@ define <3 x half> @v_fminimum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v1, v5 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 @@ -2501,33 +2509,36 @@ define <3 x half> @v_fminimum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3 ; GFX942-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v0 ; GFX942-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v2 ; GFX942-NEXT: v_pk_min_f16 v7, v7, v9 -; GFX942-NEXT: v_mov_b32_e32 v9, 0x7e00 +; GFX942-NEXT: v_mov_b32_e32 v12, 0x7e00 +; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v6, v6, v8 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: v_cndmask_b32_sdwa v12, v9, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4 ; GFX942-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5 -; GFX942-NEXT: v_cndmask_b32_sdwa v8, v9, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v9, v7, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v9, v6, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc ; GFX942-NEXT: v_perm_b32 v2, v8, v0, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v11 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v6, v12, v1, s0 +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX942-NEXT: v_perm_b32 v6, v9, v1, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc ; GFX942-NEXT: v_pk_min_f16 v6, v6, v10 -; GFX942-NEXT: v_cndmask_b32_sdwa v3, v9, v2, vcc dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc ; GFX942-NEXT: v_perm_b32 v0, v3, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2575,16 +2586,18 @@ define <3 x half> @v_fminimum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 @@ -2643,14 +2656,16 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) { ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX942-NEXT: v_pack_b32_f16 v7, v1, s0 -; GFX942-NEXT: v_pk_min_f16 v7, v7, v3 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX942-NEXT: v_perm_b32 v4, v6, v0, s1 ; GFX942-NEXT: v_pk_min_f16 v4, v4, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v7, vcc +; GFX942-NEXT: v_pack_b32_f16 v7, v1, s0 +; GFX942-NEXT: v_pk_min_f16 v7, v7, v3 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc @@ -2695,16 +2710,18 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) { ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX942-NEXT: v_perm_b32 v1, v1, v4, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v1, 4.0 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 @@ -2752,37 +2769,38 @@ define <4 x half> @v_fminimum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c ; GFX942-NEXT: v_pk_min_f16 v6, v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_mov_b32 s2, 0x5040100 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 +; GFX942-NEXT: v_perm_b32 v2, v1, v6, s2 ; GFX942-NEXT: v_pk_min_f16 v2, v5, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v5, v1 src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s2 ; GFX942-NEXT: v_pk_min_f16 v2, v4, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 -; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX942-NEXT: v_perm_b32 v1, v1, v3, s2 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 +; GFX942-NEXT: v_perm_b32 v0, v0, v5, s2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v4f16: @@ -2817,37 +2835,38 @@ define <4 x half> @v_fminimum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x ; GFX942-NEXT: v_pk_min_f16 v6, v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_mov_b32 s2, 
0x5040100 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 +; GFX942-NEXT: v_perm_b32 v2, v1, v6, s2 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v5 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v1, v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s2 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 -; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX942-NEXT: v_perm_b32 v1, v1, v3, s2 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 +; GFX942-NEXT: v_perm_b32 v0, v0, v5, s2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v4f16_commute: @@ -2891,38 +2910,41 @@ define <4 x half> @v_fminimum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 ; GFX942-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1 ; GFX942-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3 ; GFX942-NEXT: v_pk_min_f16 v7, v7, v9 -; GFX942-NEXT: v_mov_b32_e32 v9, 0x7e00 +; GFX942-NEXT: v_mov_b32_e32 v12, 0x7e00 +; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v6, v6, v8 -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: v_cndmask_b32_sdwa v12, v9, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_mov_b32 s2, 0x5040100 +; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5 ; GFX942-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4 -; GFX942-NEXT: v_cndmask_b32_sdwa v8, v9, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v9, v7, vcc +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v8, |v5| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc -; GFX942-NEXT: v_perm_b32 v2, v8, v1, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc +; GFX942-NEXT: v_perm_b32 v2, v8, v1, s2 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v11 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, 
v8, |v5| src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v6, v12, v0, s0 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: v_perm_b32 v6, v9, v0, s2 ; GFX942-NEXT: v_pk_min_f16 v6, v6, v10 -; GFX942-NEXT: v_cndmask_b32_sdwa v3, v9, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v12, |v4| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_sdwa v3, v12, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v7, v9, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_sdwa v7, v12, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| -; GFX942-NEXT: v_perm_b32 v1, v3, v1, s0 +; GFX942-NEXT: v_perm_b32 v1, v3, v1, s2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v9, v6, vcc -; GFX942-NEXT: v_perm_b32 v0, v7, v0, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc +; GFX942-NEXT: v_perm_b32 v0, v7, v0, s2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v4f16__fabs_all: @@ -2966,37 +2988,38 @@ define <4 x half> @v_fminimum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 ; GFX942-NEXT: v_pk_min_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] ; GFX942-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_mov_b32 s2, 0x5040100 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 +; GFX942-NEXT: v_perm_b32 v2, v1, v6, s2 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v1, -v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s2 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 
neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 -; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX942-NEXT: v_perm_b32 v1, v1, v3, s2 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 +; GFX942-NEXT: v_perm_b32 v0, v0, v5, s2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v4f16__fneg_all: @@ -3035,21 +3058,23 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v7, v1, 2.0 op_sel_hi:[1,0] -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_mov_b32 s2, 0x5040100 ; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v7 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v8, v5, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v5, v8, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v8, v3 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc -; GFX942-NEXT: v_perm_b32 v4, v8, v1, s0 +; GFX942-NEXT: v_perm_b32 v4, v8, v1, s2 ; GFX942-NEXT: v_pk_min_f16 v4, v4, v3 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v8, v6, v0, s0 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: v_perm_b32 v8, v6, v0, s2 ; GFX942-NEXT: v_pk_min_f16 v8, v8, v2 ; GFX942-NEXT: v_cndmask_b32_sdwa v7, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1 @@ -3059,10 +3084,10 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX942-NEXT: v_perm_b32 v1, v7, v1, s0 +; GFX942-NEXT: v_perm_b32 v1, v7, v1, s2 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc -; GFX942-NEXT: v_perm_b32 v0, v6, v0, s0 +; GFX942-NEXT: v_perm_b32 v0, v6, v0, s2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v4f16__inlineimm1: @@ -3097,18 +3122,19 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) { ; GFX942-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; 
GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v1, v4, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] @@ -3947,11 +3973,12 @@ define <4 x half> @v_no_fminimum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b, ; GFX942-NEXT: v_pk_min_f16 v3, v0, v1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v1, v5, s0 ; GFX942-NEXT: v_pk_min_f16 v3, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index e55178d0d8d30..573fdd392267b 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -350,38 +350,38 @@ define amdgpu_kernel void @half8_inselt(ptr addrspace(1) %out, <8 x half> %vec, ; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_lshr_b32 s3, s2, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 5 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; GCN-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[6:7] +; GCN-NEXT: s_cmp_lg_u32 s8, 5 ; GCN-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_cmp_lg_u32 s8, 4 -; GCN-NEXT: v_cndmask_b32_sdwa v1, v0, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s6, s1, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 3 +; GCN-NEXT: s_cmp_lg_u32 s8, 4 +; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_lshr_b32 s2, s1, 16 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; GCN-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] +; GCN-NEXT: s_cmp_lg_u32 s8, 3 ; GCN-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s8, 2 -; GCN-NEXT: v_cndmask_b32_sdwa v1, v0, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v4, s1 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_lshr_b32 s1, s0, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 1 -; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GCN-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] +; GCN-NEXT: s_cmp_lg_u32 s8, 1 ; GCN-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v4, s1 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: v_cndmask_b32_sdwa v4, v0, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v5, s0 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GCN-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v4, s4 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll index 816f5c2727c6f..139dd059997f5 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll @@ -1102,7 +1102,7 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ; VI-LABEL: v_insertelement_v8bf16_dynamic: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 @@ -1112,42 +1112,45 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: s_cmp_eq_u32 s9, 6 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s9, 7 +; VI-NEXT: s_cmp_eq_u32 s5, 6 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s9, 4 -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s9, 5 -; VI-NEXT: v_mov_b32_e32 v6, s8 -; VI-NEXT: s_cselect_b64 s[10:11], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s9, 2 -; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s9, 3 -; VI-NEXT: s_cselect_b64 s[12:13], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s9, 0 -; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s9, 1 -; VI-NEXT: s_cselect_b64 s[8:9], -1, 0 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 7 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e64 v7, v3, v6, s[0:1] -; VI-NEXT: v_cndmask_b32_sdwa v3, v3, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[10:11] -; VI-NEXT: v_cndmask_b32_e64 v8, v2, v6, s[2:3] -; VI-NEXT: v_cndmask_b32_sdwa v2, v2, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[12:13] -; VI-NEXT: v_cndmask_b32_e64 v9, v1, v6, s[4:5] -; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[8:9] +; VI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 4 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 
+; VI-NEXT: s_cmp_eq_u32 s5, 5 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 2 +; VI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 3 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 0 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 1 ; VI-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e64 v7, v0, v6, s[6:7] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e64 v7, v0, v6, s[0:1] ; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -1455,92 +1458,100 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; ; VI-LABEL: v_insertelement_v16bf16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_load_dwordx2 s[16:17], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8 -; VI-NEXT: s_cmp_eq_u32 s17, 14 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 15 +; VI-NEXT: s_cmp_eq_u32 s7, 14 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc +; VI-NEXT: v_mov_b32_e32 v12, s6 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 15 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cndmask_b32_e32 v13, v3, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 12 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 13 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; VI-NEXT: v_cndmask_b32_e64 v2, v2, v12, s[0:1] +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 10 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 12 -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] +; VI-NEXT: s_cmp_eq_u32 s7, 11 +; 
VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; VI-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 13 -; VI-NEXT: v_mov_b32_e32 v12, s16 -; VI-NEXT: s_cselect_b64 s[18:19], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 10 +; VI-NEXT: s_cmp_eq_u32 s7, 8 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 11 -; VI-NEXT: s_cselect_b64 s[20:21], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 8 -; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 9 -; VI-NEXT: s_cselect_b64 s[22:23], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 6 -; VI-NEXT: s_cselect_b64 s[8:9], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 7 -; VI-NEXT: s_cselect_b64 s[24:25], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 4 -; VI-NEXT: s_cselect_b64 s[10:11], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 5 -; VI-NEXT: s_cselect_b64 s[26:27], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 2 -; VI-NEXT: s_cselect_b64 s[12:13], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 3 -; VI-NEXT: s_cselect_b64 s[28:29], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 0 -; VI-NEXT: s_cselect_b64 s[14:15], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 1 -; VI-NEXT: s_cselect_b64 s[16:17], -1, 0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cndmask_b32_e64 v13, v3, v12, s[0:1] -; VI-NEXT: v_cndmask_b32_sdwa v3, v3, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[18:19] -; VI-NEXT: v_cndmask_b32_e64 v14, v2, v12, s[2:3] -; VI-NEXT: v_cndmask_b32_sdwa v2, v2, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[20:21] -; VI-NEXT: v_cndmask_b32_e64 v15, v1, v12, s[4:5] -; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[22:23] -; VI-NEXT: v_cndmask_b32_e64 v16, v0, v12, s[6:7] -; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[24:25] +; VI-NEXT: v_cndmask_b32_e64 v15, v15, v12, s[2:3] +; VI-NEXT: s_cmp_eq_u32 s7, 9 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 6 +; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 7 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e64 v17, v7, v12, s[8:9] -; VI-NEXT: v_cndmask_b32_sdwa v7, v7, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[26:27] +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_cndmask_b32_e64 v14, v14, v12, s[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 4 ; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e64 v13, v6, v12, s[10:11] -; VI-NEXT: v_cndmask_b32_sdwa v6, v6, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[28:29] -; VI-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e64 v14, v5, v12, s[12:13] -; VI-NEXT: v_cndmask_b32_sdwa v5, v5, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[16:17] -; VI-NEXT: v_or_b32_sdwa v6, v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e64 v13, v4, v12, s[14:15] -; VI-NEXT: v_cndmask_b32_sdwa v4, v4, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v15, v17, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 5 +; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 2 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 3 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 0 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 1 +; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; VI-NEXT: flat_store_dwordx4 v[10:11], v[0:3] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index de979f4f69698..4f349a39a456e 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -2740,7 +2740,7 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; VI-LABEL: v_insertelement_v8f16_dynamic: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_mov_b32 flat_scratch_lo, 
s13 @@ -2750,42 +2750,45 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: s_cmp_eq_u32 s9, 6 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s9, 7 +; VI-NEXT: s_cmp_eq_u32 s5, 6 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s9, 4 -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s9, 5 -; VI-NEXT: v_mov_b32_e32 v6, s8 -; VI-NEXT: s_cselect_b64 s[10:11], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s9, 2 -; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s9, 3 -; VI-NEXT: s_cselect_b64 s[12:13], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s9, 0 -; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s9, 1 -; VI-NEXT: s_cselect_b64 s[8:9], -1, 0 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 7 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e64 v7, v3, v6, s[0:1] -; VI-NEXT: v_cndmask_b32_sdwa v3, v3, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[10:11] -; VI-NEXT: v_cndmask_b32_e64 v8, v2, v6, s[2:3] -; VI-NEXT: v_cndmask_b32_sdwa v2, v2, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[12:13] -; VI-NEXT: v_cndmask_b32_e64 v9, v1, v6, s[4:5] -; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[8:9] +; VI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 4 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 5 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 2 +; VI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 3 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 0 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 1 ; VI-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e64 v7, v0, v6, s[6:7] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc +; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e64 v7, v0, v6, s[0:1] ; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; 
VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -3272,92 +3275,100 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; ; VI-LABEL: v_insertelement_v16f16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_load_dwordx2 s[16:17], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8 -; VI-NEXT: s_cmp_eq_u32 s17, 14 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 15 +; VI-NEXT: s_cmp_eq_u32 s7, 14 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc +; VI-NEXT: v_mov_b32_e32 v12, s6 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 15 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cndmask_b32_e32 v13, v3, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 12 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 13 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; VI-NEXT: v_cndmask_b32_e64 v2, v2, v12, s[0:1] +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 10 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 12 -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] +; VI-NEXT: s_cmp_eq_u32 s7, 11 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; VI-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 13 -; VI-NEXT: v_mov_b32_e32 v12, s16 -; VI-NEXT: s_cselect_b64 s[18:19], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 10 +; VI-NEXT: s_cmp_eq_u32 s7, 8 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 11 -; VI-NEXT: s_cselect_b64 s[20:21], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 8 -; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 9 -; VI-NEXT: s_cselect_b64 s[22:23], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 6 -; VI-NEXT: s_cselect_b64 s[8:9], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 7 -; VI-NEXT: s_cselect_b64 s[24:25], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 4 -; VI-NEXT: s_cselect_b64 s[10:11], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 5 -; VI-NEXT: s_cselect_b64 s[26:27], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 2 -; VI-NEXT: s_cselect_b64 s[12:13], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 3 -; VI-NEXT: s_cselect_b64 s[28:29], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 0 -; VI-NEXT: s_cselect_b64 s[14:15], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s17, 1 -; VI-NEXT: s_cselect_b64 s[16:17], -1, 0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cndmask_b32_e64 v13, v3, v12, s[0:1] -; VI-NEXT: v_cndmask_b32_sdwa v3, v3, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[18:19] -; VI-NEXT: v_cndmask_b32_e64 v14, v2, v12, s[2:3] -; VI-NEXT: v_cndmask_b32_sdwa 
v2, v2, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[20:21] -; VI-NEXT: v_cndmask_b32_e64 v15, v1, v12, s[4:5] -; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[22:23] -; VI-NEXT: v_cndmask_b32_e64 v16, v0, v12, s[6:7] -; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[24:25] +; VI-NEXT: v_cndmask_b32_e64 v15, v15, v12, s[2:3] +; VI-NEXT: s_cmp_eq_u32 s7, 9 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 6 +; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 7 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e64 v17, v7, v12, s[8:9] -; VI-NEXT: v_cndmask_b32_sdwa v7, v7, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[26:27] +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_cndmask_b32_e64 v14, v14, v12, s[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 4 ; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e64 v13, v6, v12, s[10:11] -; VI-NEXT: v_cndmask_b32_sdwa v6, v6, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[28:29] -; VI-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e64 v14, v5, v12, s[12:13] -; VI-NEXT: v_cndmask_b32_sdwa v5, v5, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[16:17] -; VI-NEXT: v_or_b32_sdwa v6, v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e64 v13, v4, v12, s[14:15] -; VI-NEXT: v_cndmask_b32_sdwa v4, v4, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v15, v17, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 5 +; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc +; 
VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 2 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 3 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 0 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 1 +; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; VI-NEXT: flat_store_dwordx4 v[10:11], v[0:3] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 59c051fb9e741..b8c1d55289c52 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -1576,37 +1576,41 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX8-NEXT: v_max_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_sdwa v6, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v6, v5, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v8, v7, v6 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v7, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; GFX8-NEXT: v_max_f16_e32 v7, v1, v3 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX8-NEXT: v_max_f16_e32 v3, v0, v2 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_maximum_v4f16: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_o_f16_sdwa s[4:5], v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[4:5] ; GFX900-NEXT: v_pk_max_f16 v4, v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX900-NEXT: 
v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX900-NEXT: v_pk_max_f16 v3, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v3 +; GFX900-NEXT: v_cndmask_b32_e64 v1, v5, v4, s[4:5] +; GFX900-NEXT: v_cndmask_b32_sdwa v3, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: v_perm_b32 v0, v0, v6, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v1, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_maximum_v4f16: @@ -1811,37 +1815,41 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX8-NEXT: v_max_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_sdwa v6, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v6, v5, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v8, v7, v6 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v7, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; GFX8-NEXT: v_max_f16_e32 v7, v1, v3 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX8-NEXT: v_max_f16_e32 v3, v0, v2 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_maximum_v4f16__nsz: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_o_f16_sdwa s[4:5], v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[4:5] ; GFX900-NEXT: v_pk_max_f16 v4, v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX900-NEXT: v_pk_max_f16 v3, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v3 +; 
GFX900-NEXT: v_cndmask_b32_e64 v1, v5, v4, s[4:5] +; GFX900-NEXT: v_cndmask_b32_sdwa v3, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: v_perm_b32 v0, v0, v6, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v1, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_maximum_v4f16__nsz: @@ -2071,35 +2079,47 @@ define <8 x half> @v_maximum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX8-LABEL: v_maximum_v8f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v8, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v9, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16 vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v8, v9, v8, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_sdwa v10, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_o_f16 vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v10, v9, v10, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_sdwa v11, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v11, v9, v11, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_sdwa v12, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v12, v9, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX8-NEXT: v_max_f16_e32 v10, v9, v8 +; GFX8-NEXT: v_mov_b32_e32 v11, 0x7e00 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v9, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v11, v10, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX8-NEXT: v_max_f16_e32 v12, v10, v9 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v10, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; GFX8-NEXT: v_max_f16_e32 v13, v12, v10 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v12, v10 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; GFX8-NEXT: v_max_f16_e32 v14, v13, v12 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v13, v12 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v11, v14, vcc ; GFX8-NEXT: v_max_f16_e32 v13, v3, v7 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v13, vcc ; GFX8-NEXT: v_max_f16_e32 v7, v2, v6 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v11, v7, vcc ; GFX8-NEXT: v_max_f16_e32 v6, v1, v5 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v11, v6, vcc ; GFX8-NEXT: v_max_f16_e32 v5, v0, v4 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v0, v11, v5, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_maximum_v8f16: @@ -2109,23 +2129,27 @@ define <8 x half> @v_maximum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX900-NEXT: v_mov_b32_e32 v9, 0x7e00 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v3, v9, v8, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc ; GFX900-NEXT: v_pk_max_f16 v7, v2, v6 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 ; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v7, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v2, v9, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc ; GFX900-NEXT: v_pk_max_f16 v6, v1, v5 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 ; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v1, v9, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc ; GFX900-NEXT: v_pk_max_f16 v5, v0, v4 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v5, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v0, v9, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v6, s4 ; GFX900-NEXT: v_perm_b32 v1, v1, v7, s4 @@ -2383,63 +2407,87 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; 
GFX8-LABEL: v_maximum_v16f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v16, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v17, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16 vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v16, v17, v16, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_sdwa v18, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_o_f16 vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v18, v17, v18, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v19, v6, v14 -; GFX8-NEXT: v_cmp_o_f16_e64 s[4:5], v6, v14 -; GFX8-NEXT: v_max_f16_sdwa v6, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_o_f16 vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v6, v17, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v14, v5, v13 -; GFX8-NEXT: v_cmp_o_f16_e64 s[6:7], v5, v13 -; GFX8-NEXT: v_max_f16_sdwa v5, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_o_f16 vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v5, v17, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v13, v4, v12 -; GFX8-NEXT: v_cmp_o_f16_e64 s[8:9], v4, v12 -; GFX8-NEXT: v_max_f16_sdwa v4, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_o_f16 vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v4, v17, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v12, v3, v11 -; GFX8-NEXT: v_cmp_o_f16_e64 s[10:11], v3, v11 -; GFX8-NEXT: v_max_f16_sdwa v3, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_o_f16 vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v3, v17, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v11, v2, v10 -; GFX8-NEXT: v_cmp_o_f16_e64 s[12:13], v2, v10 -; GFX8-NEXT: v_max_f16_sdwa v2, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v2, v17, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v10, v7, v15 -; GFX8-NEXT: v_cmp_o_f16_e64 s[14:15], v7, v15 -; GFX8-NEXT: v_max_f16_sdwa v7, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v7, v17, v7, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e64 v15, v17, v19, s[4:5] -; GFX8-NEXT: v_max_f16_e32 v19, v1, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; GFX8-NEXT: v_max_f16_e32 v16, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; GFX8-NEXT: v_max_f16_e32 v20, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[4:5], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v12 +; GFX8-NEXT: 
v_lshrrev_b32_e32 v18, 16, v4 +; GFX8-NEXT: v_max_f16_e32 v21, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[6:7], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; GFX8-NEXT: v_max_f16_e32 v22, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[8:9], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; GFX8-NEXT: v_max_f16_e32 v23, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[10:11], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; GFX8-NEXT: v_max_f16_e32 v24, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[12:13], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX8-NEXT: v_max_f16_e32 v25, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[14:15], v18, v17 +; GFX8-NEXT: v_max_f16_e32 v17, v6, v14 +; GFX8-NEXT: v_cmp_o_f16_e64 s[16:17], v6, v14 +; GFX8-NEXT: v_max_f16_e32 v6, v5, v13 +; GFX8-NEXT: v_cmp_o_f16_e64 s[18:19], v5, v13 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v12 +; GFX8-NEXT: v_cmp_o_f16_e64 s[20:21], v4, v12 +; GFX8-NEXT: v_max_f16_e32 v4, v3, v11 +; GFX8-NEXT: v_cmp_o_f16_e64 s[22:23], v3, v11 +; GFX8-NEXT: v_max_f16_e32 v11, v7, v15 +; GFX8-NEXT: v_cmp_o_f16_e64 s[24:25], v7, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_mov_b32_e32 v19, 0x7e00 +; GFX8-NEXT: v_max_f16_e32 v13, v7, v12 +; GFX8-NEXT: v_cmp_o_f16_e64 s[26:27], v7, v12 +; GFX8-NEXT: v_max_f16_e32 v3, v2, v10 +; GFX8-NEXT: v_cndmask_b32_e64 v12, v19, v13, s[26:27] +; GFX8-NEXT: v_cndmask_b32_e32 v13, v19, v16, vcc +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 +; GFX8-NEXT: v_max_f16_e32 v14, v1, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v19, vcc -; GFX8-NEXT: v_max_f16_e32 v9, v0, v8 +; GFX8-NEXT: v_max_f16_e32 v7, v0, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v18, v19, v22, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v22, v19, v25, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v14, vcc ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v14, v17, v14, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v13, v17, v13, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v12, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v4, v13, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v5, v14, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v6, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v7, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e64 v16, v19, v21, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v21, v19, v24, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v19, v7, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; GFX8-NEXT: 
v_cndmask_b32_e64 v15, v19, v20, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v20, v19, v23, s[10:11] +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v19, v4, s[22:23] +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v19, v5, s[20:21] +; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v19, v6, s[18:19] +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v15 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v19, v11, s[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v17, v19, v17, s[16:17] +; GFX8-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; GFX8-NEXT: v_or_b32_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v11, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_maximum_v16f16: @@ -2449,43 +2497,51 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX900-NEXT: v_mov_b32_e32 v17, 0x7e00 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 ; GFX900-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v7, v17, v16, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc ; GFX900-NEXT: v_pk_max_f16 v15, v6, v14 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 ; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v6, v17, v15, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v17, v15, vcc ; GFX900-NEXT: v_pk_max_f16 v14, v5, v13 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 ; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v14, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v5, v17, v14, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc ; GFX900-NEXT: v_pk_max_f16 v13, v4, v12 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 ; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v13, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v4, v17, v13, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v17, v13, vcc ; GFX900-NEXT: v_pk_max_f16 v12, v3, v11 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 ; GFX900-NEXT: v_cndmask_b32_e32 v13, 
v17, v12, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v3, v17, v12, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc ; GFX900-NEXT: v_pk_max_f16 v11, v2, v10 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 ; GFX900-NEXT: v_cndmask_b32_e32 v12, v17, v11, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v2, v17, v11, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v17, v11, vcc ; GFX900-NEXT: v_pk_max_f16 v10, v1, v9 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 ; GFX900-NEXT: v_cndmask_b32_e32 v11, v17, v10, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v1, v17, v10, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v10, vcc ; GFX900-NEXT: v_pk_max_f16 v9, v0, v8 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v17, v9, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v0, v17, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v10, s4 ; GFX900-NEXT: v_perm_b32 v1, v1, v11, s4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index e33e34d7901cc..f62e1fa5fed81 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -1261,37 +1261,41 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX8-NEXT: v_min_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_sdwa v6, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v6, v5, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v8, v7, v6 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v7, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; GFX8-NEXT: v_min_f16_e32 v7, v1, v3 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX8-NEXT: v_min_f16_e32 v3, v0, v2 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX900-LABEL: v_minimum_v4f16: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_o_f16_sdwa s[4:5], v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[4:5] ; GFX900-NEXT: v_pk_min_f16 v4, v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX900-NEXT: v_pk_min_f16 v3, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v3 +; GFX900-NEXT: v_cndmask_b32_e64 v1, v5, v4, s[4:5] +; GFX900-NEXT: v_cndmask_b32_sdwa v3, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: v_perm_b32 v0, v0, v6, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v1, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_minimum_v4f16: @@ -1437,37 +1441,41 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX8-NEXT: v_min_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_sdwa v6, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v6, v5, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v8, v7, v6 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v7, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; GFX8-NEXT: v_min_f16_e32 v7, v1, v3 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX8-NEXT: v_min_f16_e32 v3, v0, v2 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_minimum_v4f16__nsz: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_o_f16_sdwa s[4:5], v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[4:5] ; GFX900-NEXT: v_pk_min_f16 v4, v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX900-NEXT: 
v_cmp_o_f16_e32 vcc, v1, v3 -; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX900-NEXT: v_pk_min_f16 v3, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v3 +; GFX900-NEXT: v_cndmask_b32_e64 v1, v5, v4, s[4:5] +; GFX900-NEXT: v_cndmask_b32_sdwa v3, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: v_perm_b32 v0, v0, v6, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v1, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_minimum_v4f16__nsz: @@ -1610,35 +1618,47 @@ define <8 x half> @v_minimum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX8-LABEL: v_minimum_v8f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_f16_sdwa v8, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v9, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16 vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v8, v9, v8, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_sdwa v10, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_o_f16 vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v10, v9, v10, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_sdwa v11, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v11, v9, v11, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_sdwa v12, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v12, v9, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX8-NEXT: v_min_f16_e32 v10, v9, v8 +; GFX8-NEXT: v_mov_b32_e32 v11, 0x7e00 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v9, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v11, v10, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX8-NEXT: v_min_f16_e32 v12, v10, v9 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v10, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; GFX8-NEXT: v_min_f16_e32 v13, v12, v10 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v12, v10 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; GFX8-NEXT: v_min_f16_e32 
v14, v13, v12 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v13, v12 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v11, v14, vcc ; GFX8-NEXT: v_min_f16_e32 v13, v3, v7 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v13, vcc ; GFX8-NEXT: v_min_f16_e32 v7, v2, v6 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v11, v7, vcc ; GFX8-NEXT: v_min_f16_e32 v6, v1, v5 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v11, v6, vcc ; GFX8-NEXT: v_min_f16_e32 v5, v0, v4 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v0, v11, v5, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_minimum_v8f16: @@ -1648,23 +1668,27 @@ define <8 x half> @v_minimum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX900-NEXT: v_mov_b32_e32 v9, 0x7e00 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v3, v9, v8, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc ; GFX900-NEXT: v_pk_min_f16 v7, v2, v6 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 ; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v7, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v2, v9, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc ; GFX900-NEXT: v_pk_min_f16 v6, v1, v5 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 ; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v1, v9, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc ; GFX900-NEXT: v_pk_min_f16 v5, v0, v4 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v5, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: 
v_cndmask_b32_sdwa v0, v9, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v6, s4 ; GFX900-NEXT: v_perm_b32 v1, v1, v7, s4 @@ -1802,63 +1826,87 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX8-LABEL: v_minimum_v16f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_f16_sdwa v16, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v17, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16 vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v16, v17, v16, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_sdwa v18, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_o_f16 vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v18, v17, v18, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v19, v6, v14 -; GFX8-NEXT: v_cmp_o_f16_e64 s[4:5], v6, v14 -; GFX8-NEXT: v_min_f16_sdwa v6, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_o_f16 vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v6, v17, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v14, v5, v13 -; GFX8-NEXT: v_cmp_o_f16_e64 s[6:7], v5, v13 -; GFX8-NEXT: v_min_f16_sdwa v5, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_o_f16 vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v5, v17, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v13, v4, v12 -; GFX8-NEXT: v_cmp_o_f16_e64 s[8:9], v4, v12 -; GFX8-NEXT: v_min_f16_sdwa v4, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_o_f16 vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v4, v17, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v12, v3, v11 -; GFX8-NEXT: v_cmp_o_f16_e64 s[10:11], v3, v11 -; GFX8-NEXT: v_min_f16_sdwa v3, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_o_f16 vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v3, v17, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v11, v2, v10 -; GFX8-NEXT: v_cmp_o_f16_e64 s[12:13], v2, v10 -; GFX8-NEXT: v_min_f16_sdwa v2, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v2, v17, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v10, v7, v15 -; GFX8-NEXT: v_cmp_o_f16_e64 s[14:15], v7, v15 -; GFX8-NEXT: v_min_f16_sdwa v7, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_cndmask_b32_sdwa v7, v17, v7, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_cndmask_b32_e64 v15, v17, v19, s[4:5] -; GFX8-NEXT: v_min_f16_e32 v19, v1, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 
16, v14 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; GFX8-NEXT: v_min_f16_e32 v16, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; GFX8-NEXT: v_min_f16_e32 v20, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[4:5], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v12 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX8-NEXT: v_min_f16_e32 v21, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[6:7], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; GFX8-NEXT: v_min_f16_e32 v22, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[8:9], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; GFX8-NEXT: v_min_f16_e32 v23, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[10:11], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; GFX8-NEXT: v_min_f16_e32 v24, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[12:13], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX8-NEXT: v_min_f16_e32 v25, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[14:15], v18, v17 +; GFX8-NEXT: v_min_f16_e32 v17, v6, v14 +; GFX8-NEXT: v_cmp_o_f16_e64 s[16:17], v6, v14 +; GFX8-NEXT: v_min_f16_e32 v6, v5, v13 +; GFX8-NEXT: v_cmp_o_f16_e64 s[18:19], v5, v13 +; GFX8-NEXT: v_min_f16_e32 v5, v4, v12 +; GFX8-NEXT: v_cmp_o_f16_e64 s[20:21], v4, v12 +; GFX8-NEXT: v_min_f16_e32 v4, v3, v11 +; GFX8-NEXT: v_cmp_o_f16_e64 s[22:23], v3, v11 +; GFX8-NEXT: v_min_f16_e32 v11, v7, v15 +; GFX8-NEXT: v_cmp_o_f16_e64 s[24:25], v7, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_mov_b32_e32 v19, 0x7e00 +; GFX8-NEXT: v_min_f16_e32 v13, v7, v12 +; GFX8-NEXT: v_cmp_o_f16_e64 s[26:27], v7, v12 +; GFX8-NEXT: v_min_f16_e32 v3, v2, v10 +; GFX8-NEXT: v_cndmask_b32_e64 v12, v19, v13, s[26:27] +; GFX8-NEXT: v_cndmask_b32_e32 v13, v19, v16, vcc +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 +; GFX8-NEXT: v_min_f16_e32 v14, v1, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v19, vcc -; GFX8-NEXT: v_min_f16_e32 v9, v0, v8 +; GFX8-NEXT: v_min_f16_e32 v7, v0, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v18, v19, v22, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v22, v19, v25, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v14, vcc ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v14, v17, v14, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v13, v17, v13, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v12, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v4, v13, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v5, v14, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v6, v15, v18 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v7, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e64 v16, v19, v21, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v21, v19, v24, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v19, v7, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; GFX8-NEXT: v_cndmask_b32_e64 v15, v19, v20, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v20, v19, v23, s[10:11] +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v19, v4, s[22:23] +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v19, v5, s[20:21] +; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v19, v6, s[18:19] +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v15 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v19, v11, s[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v17, v19, v17, s[16:17] +; GFX8-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; GFX8-NEXT: v_or_b32_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v11, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_minimum_v16f16: @@ -1868,43 +1916,51 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX900-NEXT: v_mov_b32_e32 v17, 0x7e00 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 ; GFX900-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v7, v17, v16, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc ; GFX900-NEXT: v_pk_min_f16 v15, v6, v14 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 ; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v6, v17, v15, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v17, v15, vcc ; GFX900-NEXT: v_pk_min_f16 v14, v5, v13 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 ; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v14, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v5, v17, v14, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc ; GFX900-NEXT: v_pk_min_f16 v13, v4, v12 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 ; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v13, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v13, 
16, v13 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v4, v17, v13, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v17, v13, vcc ; GFX900-NEXT: v_pk_min_f16 v12, v3, v11 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 ; GFX900-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v3, v17, v12, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc ; GFX900-NEXT: v_pk_min_f16 v11, v2, v10 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 ; GFX900-NEXT: v_cndmask_b32_e32 v12, v17, v11, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v2, v17, v11, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v17, v11, vcc ; GFX900-NEXT: v_pk_min_f16 v10, v1, v9 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 ; GFX900-NEXT: v_cndmask_b32_e32 v11, v17, v10, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v1, v17, v10, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v10, vcc ; GFX900-NEXT: v_pk_min_f16 v9, v0, v8 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v17, v9, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v0, v17, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v10, s4 ; GFX900-NEXT: v_perm_b32 v1, v1, v11, s4 diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll index e90fe93cf8d1c..718a266f49f5d 100644 --- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll @@ -211,7 +211,8 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) { ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_sdwa v2, v1, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -242,7 +243,8 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) { ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v3 -; GFX9-NEXT: v_cndmask_b32_sdwa v2, v1, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll index ae37f4b43896c..2b4d687bb0c29 100644 --- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll 
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll @@ -211,7 +211,8 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) { ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_sdwa v2, v1, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -243,7 +244,8 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) { ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3 -; GFX9-NEXT: v_cndmask_b32_sdwa v2, v1, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll index 7ff0d3f511c60..4e27cf20d3c98 100644 --- a/llvm/test/CodeGen/AMDGPU/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll @@ -184,17 +184,17 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8-NEXT: v_add_u16_e32 v4, v3, v2 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v3 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2 -; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v4 -; GFX8-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[4:5] ; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 -; GFX8-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v1 ; GFX8-NEXT: v_add_u16_e32 v1, v0, v1 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1 ; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -247,11 +247,10 @@ define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { ; GFX8-NEXT: v_add_u16_e32 v6, v5, v4 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4 -; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 -; GFX8-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[4:5] ; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 -; GFX8-NEXT: v_cndmask_b32_sdwa v4, v6, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v3 ; GFX8-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 @@ -266,7 +265,8 @@ define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { ; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_v3i16: @@ -324,17 +324,17 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8-NEXT: v_add_u16_e32 v6, v5, v4 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4 -; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 -; GFX8-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[4:5] ; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 -; GFX8-NEXT: v_cndmask_b32_sdwa v4, v6, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v2 ; GFX8-NEXT: v_add_u16_e32 v2, v0, v2 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2 ; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 @@ -342,17 +342,17 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8-NEXT: v_add_u16_e32 v5, v4, v2 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2 -; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v5 -; GFX8-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[4:5] ; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 -; GFX8-NEXT: v_cndmask_b32_sdwa v2, v5, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v3 ; GFX8-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 ; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3 ; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll index aa65c6afd4eb9..57b4dba19d3e1 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll @@ -171,12 +171,14 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fabs_fabs_v2f16(<2 x ; VI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc ; VI-NEXT: v_add_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v5 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_add_f16_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_add_f16_sdwa v1, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v2, v2, v4 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: 
s_setpc_b64 s[30:31] @@ -442,12 +444,14 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fabs_fabs_v2f16(<2 x ; VI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc ; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_add_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_add_f16_sdwa v1, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v2, v3, v5 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index 862bc792680f9..b8187231ea709 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -1732,16 +1732,22 @@ define <4 x half> @v_vselect_v4f16(<4 x half> %a, <4 x half> %b, <4 x i32> %cond ; VI-LABEL: v_vselect_v4f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; VI-NEXT: v_cndmask_b32_sdwa v7, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; VI-NEXT: v_cndmask_b32_sdwa v5, v2, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cndmask_b32_e32 v5, v9, v8, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_vselect_v4f16: @@ -1918,14 +1924,22 @@ define <8 x half> @v_vselect_v8f16(<8 x half> %a, <8 x half> %b, <8 x i32> %cond ; VI-LABEL: v_vselect_v8f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 -; VI-NEXT: v_cndmask_b32_sdwa v15, v7, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cndmask_b32_e32 v15, v17, v16, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; VI-NEXT: v_cndmask_b32_sdwa v13, v6, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cndmask_b32_e32 v13, v17, v16, vcc +; VI-NEXT: 
v_lshrrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 -; VI-NEXT: v_cndmask_b32_sdwa v11, v5, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cndmask_b32_e32 v11, v17, v16, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; VI-NEXT: v_cndmask_b32_sdwa v9, v4, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cndmask_b32_e32 v9, v17, v16, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 ; VI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 @@ -1934,10 +1948,14 @@ define <8 x half> @v_vselect_v8f16(<8 x half> %a, <8 x half> %b, <8 x i32> %cond ; VI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; VI-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v11 +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_vselect_v8f16: @@ -2271,48 +2289,72 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> ; VI-LABEL: v_vselect_v16f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30 -; VI-NEXT: v_cndmask_b32_e64 v30, v15, v7, s[4:5] -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v28 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; VI-NEXT: v_cmp_eq_u32_e64 s[18:19], 0, v17 +; VI-NEXT: v_cmp_eq_u32_e64 s[40:41], 0, v29 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; VI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v18 +; VI-NEXT: v_cmp_eq_u32_e64 s[28:29], 0, v27 +; VI-NEXT: v_cndmask_b32_e64 v16, v17, v16, s[40:41] +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_cmp_eq_u32_e64 s[20:21], 0, v19 +; VI-NEXT: v_cmp_eq_u32_e64 s[26:27], 0, v25 +; VI-NEXT: v_cndmask_b32_e64 v17, v18, v17, s[28:29] +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; VI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v20 +; VI-NEXT: v_cmp_eq_u32_e64 s[24:25], 0, v23 +; VI-NEXT: v_cndmask_b32_e64 v18, v19, v18, s[26:27] +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; VI-NEXT: v_cmp_eq_u32_e64 s[22:23], 0, v21 +; VI-NEXT: v_cndmask_b32_e64 v19, v20, v19, s[24:25] +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; VI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v22 +; VI-NEXT: 
v_cndmask_b32_e64 v20, v21, v20, s[22:23] +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; VI-NEXT: v_cndmask_b32_e64 v21, v22, v21, s[20:21] +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; VI-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; VI-NEXT: v_cndmask_b32_e64 v22, v23, v22, s[18:19] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 +; VI-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v22 +; VI-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[8:9] +; VI-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v15 +; VI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v24 +; VI-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[10:11] +; VI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v26 +; VI-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[12:13] +; VI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v28 +; VI-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[14:15] +; VI-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[16:17] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; VI-NEXT: v_cndmask_b32_sdwa v7, v15, v7, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v29 -; VI-NEXT: v_cndmask_b32_e64 v15, v14, v6, s[4:5] -; VI-NEXT: v_cndmask_b32_sdwa v6, v14, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 -; VI-NEXT: v_cndmask_b32_e32 v14, v13, v5, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v12, v4, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v11, v3, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v10, v2, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v9, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v27 -; VI-NEXT: v_cndmask_b32_sdwa v5, v13, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 -; VI-NEXT: v_cndmask_b32_sdwa v4, v12, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 -; VI-NEXT: v_cndmask_b32_sdwa v3, v11, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21 -; VI-NEXT: v_cndmask_b32_sdwa v2, v10, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; VI-NEXT: v_cndmask_b32_sdwa v1, v9, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; VI-NEXT: v_cndmask_b32_sdwa v0, v8, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v24, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v22, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 +; VI-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 +; VI-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v19 +; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 +; VI-NEXT: v_or_b32_sdwa v4, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; VI-NEXT: v_or_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_or_b32_sdwa v6, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_vselect_v16f16: @@ -3068,144 +3110,223 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; VI-LABEL: v_vselect_v32f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:112 -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 
offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v32 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v36 +; VI-NEXT: v_cndmask_b32_e32 v36, v43, v38, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v35 +; VI-NEXT: v_cndmask_b32_e32 v35, v45, v44, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34 +; VI-NEXT: v_cndmask_b32_e32 v34, v47, v46, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v33 +; VI-NEXT: v_cndmask_b32_e32 v33, v57, v56, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; VI-NEXT: v_cndmask_b32_e32 v32, v59, v58, vcc ; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_cndmask_b32_e64 v31, v33, v15, s[4:5] -; VI-NEXT: v_cndmask_b32_sdwa v15, v33, v15, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v24 +; VI-NEXT: v_cndmask_b32_e32 v38, v38, v60, vcc ; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34 -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v35 -; VI-NEXT: v_cndmask_b32_e64 v32, v30, v14, s[4:5] -; VI-NEXT: v_cndmask_b32_sdwa v14, v30, v14, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v36 -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 -; VI-NEXT: v_cndmask_b32_e64 v33, v29, v13, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_cmp_eq_u32_e64 
s[4:5], 0, v37 -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 -; VI-NEXT: v_cndmask_b32_e64 v34, v28, v12, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v38 -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v39 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 +; VI-NEXT: v_cndmask_b32_e32 v39, v44, v43, vcc ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v48 -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 -; VI-NEXT: v_cndmask_b32_sdwa v13, v29, v13, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v49 -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; VI-NEXT: v_cndmask_b32_sdwa v12, v28, v12, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v50 -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 -; VI-NEXT: v_cndmask_b32_e64 v35, v27, v11, s[4:5] -; VI-NEXT: v_cndmask_b32_sdwa v11, v27, v11, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v51 -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:8 -; VI-NEXT: v_cndmask_b32_sdwa v27, v26, v10, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v52 -; VI-NEXT: v_cndmask_b32_sdwa v28, v25, v9, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 ; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v53 -; VI-NEXT: v_cndmask_b32_sdwa v29, v24, v8, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v55 +; VI-NEXT: v_cndmask_b32_e32 v31, v31, v45, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v23 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v54 -; VI-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v50 +; VI-NEXT: v_cndmask_b32_e32 v50, v43, v55, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v22 ; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v39 -; VI-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[4:5] -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 -; VI-NEXT: v_or_b32_sdwa v9, v9, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v35, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v34, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v33, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v31, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; 
VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:52 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; VI-NEXT: v_cndmask_b32_sdwa v30, v23, v7, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v53 +; VI-NEXT: v_cndmask_b32_e32 v53, v43, v55, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v54 +; VI-NEXT: v_cndmask_b32_e32 v54, v43, v55, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v52 +; VI-NEXT: v_cndmask_b32_e32 v52, v43, v55, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v51 +; VI-NEXT: v_cndmask_b32_e32 v51, v43, v55, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v49 +; VI-NEXT: v_cndmask_b32_e32 v49, v43, v55, vcc +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v48 +; VI-NEXT: v_cndmask_b32_e32 v48, v46, v43, vcc +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v40 +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 +; VI-NEXT: v_cndmask_b32_e32 v46, v58, v46, vcc +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v41 +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 +; VI-NEXT: v_cndmask_b32_e32 v15, v37, v15, vcc +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v42 +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 +; VI-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4 ; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v36 -; VI-NEXT: v_cndmask_b32_sdwa v36, v22, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v44 +; VI-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc ; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v37 -; VI-NEXT: v_cndmask_b32_sdwa v37, v21, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v55 +; VI-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v38 -; VI-NEXT: v_cndmask_b32_sdwa v38, v20, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v45 +; VI-NEXT: 
v_cndmask_b32_e32 v11, v27, v11, vcc ; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v48 -; VI-NEXT: v_cndmask_b32_sdwa v48, v19, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v47 +; VI-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v49 -; VI-NEXT: v_cndmask_b32_sdwa v49, v18, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v56 +; VI-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v50 -; VI-NEXT: v_cndmask_b32_sdwa v50, v17, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v43 +; VI-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v51 -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v51 -; VI-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[4:5] -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 -; VI-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[4:5] -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:4 -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v39 -; VI-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[4:5] -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v25 -; VI-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[4:5] -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v52 -; VI-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[4:5] -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v53 -; VI-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v54 -; VI-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v55 -; VI-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] -; VI-NEXT: v_cndmask_b32_sdwa v17, v16, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v57 +; VI-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v59 +; VI-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v40 +; VI-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v58 +; VI-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v41 +; VI-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v37 +; VI-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 
0, v42 +; VI-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 ; VI-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; VI-NEXT: v_or_b32_sdwa v0, v0, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v46 +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v49 +; VI-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v51 +; VI-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v52 +; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v54 +; VI-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 +; VI-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 +; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v39 +; VI-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v38 +; VI-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v32 +; VI-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v33 +; VI-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v34 +; VI-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v35 +; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, 
v36 +; VI-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v31 +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_vselect_v32f16: diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll index 6972aa3a0529b..40d80f5e83e36 100644 --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -184,17 +184,17 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8-NEXT: v_sub_u16_e32 v4, v3, v2 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v3 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v2 -; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v4 -; GFX8-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[4:5] ; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 -; GFX8-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v1 ; GFX8-NEXT: v_sub_u16_e32 v1, v0, v1 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1 ; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -247,11 +247,10 @@ define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { ; GFX8-NEXT: v_sub_u16_e32 v6, v5, v4 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v4 -; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 -; GFX8-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[4:5] ; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 -; GFX8-NEXT: v_cndmask_b32_sdwa v4, v6, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v3 ; GFX8-NEXT: v_sub_u16_e32 v3, v1, v3 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 @@ -266,7 +265,8 @@ define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { ; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v3i16: @@ -324,17 +324,17 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8-NEXT: v_sub_u16_e32 v6, v5, v4 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v4 -; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 -; GFX8-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[4:5] ; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 -; GFX8-NEXT: v_cndmask_b32_sdwa v4, v6, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 
0, v2
 ; GFX8-NEXT: v_sub_u16_e32 v2, v0, v2
 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0
 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2
 ; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3
@@ -342,17 +342,17 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-NEXT: v_sub_u16_e32 v5, v4, v2
 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4
 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v2
-; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v5
-; GFX8-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[4:5]
 ; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2
-; GFX8-NEXT: v_cndmask_b32_sdwa v2, v5, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v3
 ; GFX8-NEXT: v_sub_u16_e32 v3, v1, v3
 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1
 ; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3
 ; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT: s_setpc_b64 s[30:31]

From c344d1439857e151733a1bf33f162a5ec773368a Mon Sep 17 00:00:00 2001
From: Frederik Harwath
Date: Wed, 30 Apr 2025 05:42:58 -0400
Subject: [PATCH 04/26] Stop moving carry-in def instruction

---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 5 +-
 llvm/test/CodeGen/AMDGPU/bf16.ll | 44 +-
 llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 14 +-
 .../CodeGen/AMDGPU/dagcombine-fmul-sel.ll | 76 +-
 llvm/test/CodeGen/AMDGPU/fmaximum3.ll | 41 +-
 llvm/test/CodeGen/AMDGPU/fminimum3.ll | 41 +-
 .../CodeGen/AMDGPU/fneg-modifier-casting.ll | 54 +-
 llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 26 +-
 llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 26 +-
 .../AMDGPU/sdwa-peephole-vcnd_mask-1.mir | 2 +-
 .../AMDGPU/select-fabs-fneg-extract.v2f16.ll | 658 +++++++++---------
 llvm/test/CodeGen/AMDGPU/select.f16.ll | 78 +--
 12 files changed, 533 insertions(+), 532 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 7ad18356864a0..b3e9b424cf1b0 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1113,10 +1113,9 @@ void SIPeepholeSDWA::convertToImplicitVcc(MachineInstr &MI,
   MachineInstr &CarryDef = *MRI->getVRegDef(CarryReg);

   if (CarryDef.isCompare() && TII->isVOP3(CarryDef) &&
-      MRI->hasOneUse(CarryIn.getReg())) {
+      MRI->hasOneUse(CarryIn.getReg()))
     CarryDef.substituteRegister(CarryIn.getReg(), Vcc, 0, *TRI);
-    CarryDef.moveBefore(&MI);
-  } else {
+  else {
     // Add write: VCC[lanedId] <- (CarryIn[laneId] == 1)
     const TargetRegisterClass *Class =
         TRI->getRegClassForOperandReg(*MRI, CarryIn);
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index bc8665ffb9b63..0d91305917b3e 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -38574,11 +38574,11 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
 ; GFX8-LABEL: v_vselect_v2bf16:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-;
GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] ; GFX8-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -38586,11 +38586,11 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo ; GFX9-LABEL: v_vselect_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] ; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 @@ -38874,10 +38874,10 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX8-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -40783,21 +40783,23 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo ; GFX9-LABEL: v_vselect_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v3, v7, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v6, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[4:5] +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, v3, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; 
GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v4bf16: diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 0feefdf145639..afaabe08d6d6d 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -1503,14 +1503,14 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_or_b32_e32 v3, 0x100, v1 -; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3 +; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x100, v1 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 @@ -1604,16 +1604,16 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 -; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll index 1429d8e96ac7c..9dd5fd91d186a 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll @@ -2775,10 +2775,10 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-LABEL: fmul_select_bf16_test1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x3f80 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x4000 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3f80 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 
v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -2909,10 +2909,10 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-LABEL: fmul_select_bf16_test2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x3f80 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x3f00 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3f80 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x3f00 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -3049,15 +3049,16 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a ; GFX9-LABEL: fmul_select_v2bf16_test3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, 0x3f80 -; GFX9-NEXT: v_mov_b32_e32 v6, 0x4000 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX9-NEXT: v_cndmask_b32_sdwa v2, v5, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v5, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3f80 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x4000 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v3, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 @@ -3257,15 +3258,16 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a ; GFX9-LABEL: fmul_select_v2bf16_test4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, 0x3f80 -; GFX9-NEXT: v_mov_b32_e32 v6, 0x3f00 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX9-NEXT: v_cndmask_b32_sdwa v2, v5, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v5, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3f80 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x3f00 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v3, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 @@ -3460,10 +3462,10 @@ define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-LABEL: fmul_select_bf16_test5: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4100 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x4000 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: 
v_cndmask_b32_sdwa v1, v3, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4100 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -3596,10 +3598,10 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-LABEL: fmul_select_bf16_test6: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4040 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffc100 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4040 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffffc100 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -3731,10 +3733,10 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-LABEL: fmul_select_bf16_test7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffffc080 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x4100 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffc080 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4100 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -3866,10 +3868,10 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-LABEL: fmul_select_bf16_test8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff8000 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -3995,10 +3997,10 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-LABEL: fmul_select_bf16_test9: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffffc200 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffc180 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffc200 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffffc180 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -4131,10 +4133,10 @@ define bfloat 
@fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b ; GFX9-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffffdb80 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffe000 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffdb80 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffffe000 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -4267,10 +4269,10 @@ define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %b ; GFX9-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4c00 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x3480 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4c00 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x3480 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index 05d45d2b2bfac..222287e62209f 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -2187,9 +2187,9 @@ define <2 x half> @v_fmaximum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX942-NEXT: v_perm_b32 v1, v6, v0, s0 -; GFX942-NEXT: v_pk_max_f16 v1, v1, v5 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, |v2| src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_pk_max_f16 v1, v1, v5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v3, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| ; GFX942-NEXT: s_nop 1 @@ -2279,19 +2279,18 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) { ; GFX942-LABEL: v_fmaximum3_v2f16__inlineimm1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v2, v0, 2.0 op_sel_hi:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v4, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX942-NEXT: v_perm_b32 v2, v4, v0, s0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v1 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v4, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 ; GFX942-NEXT: s_nop 1 @@ -2645,15 +2644,14 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) { ; GFX942-LABEL: v_fmaximum3_v3f16__inlineimm1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_max_f16 v7, v1, 2.0 -; GFX942-NEXT: s_mov_b32 s1, 0x5040100 ; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_pk_max_f16 v7, v1, 2.0 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX942-NEXT: s_mov_b32 s1, 0x5040100 ; GFX942-NEXT: s_movk_i32 s0, 0x7e00 -; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_nop 1 @@ -3054,15 +3052,15 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX942-LABEL: v_fmaximum3_v4f16__inlineimm1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v7, v1, 2.0 op_sel_hi:[1,0] -; GFX942-NEXT: s_mov_b32 s2, 0x5040100 ; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v7 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: s_mov_b32 s2, 0x5040100 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v5, v8, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v8, v3 src0_sel:DWORD src1_sel:WORD_1 @@ -3074,20 +3072,21 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX942-NEXT: v_perm_b32 v4, v8, v1, s2 ; GFX942-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] -; GFX942-NEXT: v_perm_b32 v8, v6, v0, s2 -; GFX942-NEXT: v_pk_max_f16 v8, v8, v2 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v7, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v8, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_perm_b32 v6, v6, v0, s2 +; GFX942-NEXT: v_pk_max_f16 v6, v6, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_sdwa v8, v5, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX942-NEXT: v_perm_b32 v1, v7, v1, s2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc -; GFX942-NEXT: v_perm_b32 v0, v6, v0, s2 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; GFX942-NEXT: v_perm_b32 v0, v8, v0, s2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v4f16__inlineimm1: diff --git 
a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll index 44cc7428f888d..df0e7a03b4ab6 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -2187,9 +2187,9 @@ define <2 x half> @v_fminimum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX942-NEXT: v_perm_b32 v1, v6, v0, s0 -; GFX942-NEXT: v_pk_min_f16 v1, v1, v5 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, |v2| src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_pk_min_f16 v1, v1, v5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v3, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| ; GFX942-NEXT: s_nop 1 @@ -2279,19 +2279,18 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) { ; GFX942-LABEL: v_fminimum3_v2f16__inlineimm1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v2, v0, 2.0 op_sel_hi:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v4, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX942-NEXT: v_perm_b32 v2, v4, v0, s0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v1 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v4, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 ; GFX942-NEXT: s_nop 1 @@ -2645,15 +2644,14 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) { ; GFX942-LABEL: v_fminimum3_v3f16__inlineimm1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_min_f16 v7, v1, 2.0 -; GFX942-NEXT: s_mov_b32 s1, 0x5040100 ; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_pk_min_f16 v7, v1, 2.0 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX942-NEXT: s_mov_b32 s1, 0x5040100 ; GFX942-NEXT: s_movk_i32 s0, 0x7e00 -; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_nop 1 @@ -3054,15 +3052,15 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX942-LABEL: v_fminimum3_v4f16__inlineimm1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: 
v_pk_min_f16 v7, v1, 2.0 op_sel_hi:[1,0] -; GFX942-NEXT: s_mov_b32 s2, 0x5040100 ; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v7 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: s_mov_b32 s2, 0x5040100 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v5, v8, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v8, v3 src0_sel:DWORD src1_sel:WORD_1 @@ -3074,20 +3072,21 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX942-NEXT: v_perm_b32 v4, v8, v1, s2 ; GFX942-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] -; GFX942-NEXT: v_perm_b32 v8, v6, v0, s2 -; GFX942-NEXT: v_pk_min_f16 v8, v8, v2 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v7, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v8, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_perm_b32 v6, v6, v0, s2 +; GFX942-NEXT: v_pk_min_f16 v6, v6, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_sdwa v8, v5, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX942-NEXT: v_perm_b32 v1, v7, v1, s2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc -; GFX942-NEXT: v_perm_b32 v0, v6, v0, s2 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; GFX942-NEXT: v_perm_b32 v0, v8, v0, s2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v4f16__inlineimm1: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index 1b508af610d74..d03e6eef2e364 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -204,11 +204,11 @@ define <2 x i16> @fneg_xor_select_v2i16(<2 x i1> %cond, <2 x i16> %arg0, <2 x i1 ; GFX9-LABEL: fneg_xor_select_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] ; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 @@ -743,22 +743,24 @@ define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2 ; GFX9-LABEL: select_fneg_select_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_xor_b32_e32 v3, 0x80008000, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v5, vcc dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, v4, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_perm_b32 v4, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v3, v1, v0, s4 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -848,22 +850,24 @@ define <2 x i16> @select_fneg_xor_select_v2i16(<2 x i1> %cond0, <2 x i1> %cond1, ; GFX9-LABEL: select_fneg_xor_select_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_xor_b32_e32 v3, 0x80008000, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, v4, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_perm_b32 v4, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v3, v1, v0, s4 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index b8c1d55289c52..6920924fa1547 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -664,9 +664,9 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX8-LABEL: v_maximum_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_cndmask_b32_sdwa v2, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f16_e32 v4, v0, v1 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 @@ -830,9 +830,9 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX8-LABEL: v_maximum_v2f16__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_cndmask_b32_sdwa v2, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f16_e32 v4, v0, v1 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 @@ -1006,14 +1006,14 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX8-NEXT: s_lshr_b32 s4, s17, 16 ; GFX8-NEXT: s_lshr_b32 s5, s16, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_max_f16_e32 v1, s5, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s5, v0 -; GFX8-NEXT: v_cndmask_b32_sdwa v0, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v1, s17 -; GFX8-NEXT: v_max_f16_e32 v3, s16, v1 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX8-NEXT: v_max_f16_e32 v0, s5, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x7e00 +; GFX8-NEXT: v_mov_b32_e32 v2, s17 +; GFX8-NEXT: v_cndmask_b32_sdwa v0, v1, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v3, s16, v2 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v0 @@ -1160,9 +1160,9 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX8-LABEL: v_maximum_v3f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f16_e32 v6, v1, v3 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 @@ -1363,9 +1363,9 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX8-LABEL: v_maximum_v3f16__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f16_e32 v6, v1, v3 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 @@ -1573,9 +1573,9 @@ define <4 x half> 
@v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX8-LABEL: v_maximum_v4f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -1812,9 +1812,9 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX8-LABEL: v_maximum_v4f16__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index f62e1fa5fed81..b81c1f9a99aea 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -554,9 +554,9 @@ define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX8-LABEL: v_minimum_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_cndmask_b32_sdwa v2, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f16_e32 v4, v0, v1 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 @@ -685,9 +685,9 @@ define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX8-LABEL: v_minimum_v2f16__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_cndmask_b32_sdwa v2, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f16_e32 v4, v0, v1 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 @@ -819,14 +819,14 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX8-NEXT: s_lshr_b32 s4, s17, 16 ; GFX8-NEXT: s_lshr_b32 s5, s16, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_min_f16_e32 v1, s5, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s5, v0 -; GFX8-NEXT: v_cndmask_b32_sdwa v0, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v1, s17 -; GFX8-NEXT: v_min_f16_e32 v3, s16, v1 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; 
GFX8-NEXT: v_min_f16_e32 v0, s5, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x7e00 +; GFX8-NEXT: v_mov_b32_e32 v2, s17 +; GFX8-NEXT: v_cndmask_b32_sdwa v0, v1, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v3, s16, v2 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v0 @@ -946,9 +946,9 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX8-LABEL: v_minimum_v3f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_min_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f16_e32 v6, v1, v3 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 @@ -1102,9 +1102,9 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX8-LABEL: v_minimum_v3f16__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_min_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f16_e32 v6, v1, v3 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 @@ -1258,9 +1258,9 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX8-LABEL: v_minimum_v4f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_min_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -1438,9 +1438,9 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX8-LABEL: v_minimum_v4f16__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_min_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-vcnd_mask-1.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-vcnd_mask-1.mir index 011bbf4dbe75d..6529f190294df 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-vcnd_mask-1.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-vcnd_mask-1.mir @@ -19,11 +19,11 @@ body: | 
; gfx9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; gfx9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; gfx9-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY1]], implicit $exec + ; gfx9-NEXT: $vcc = V_CMP_EQ_U32_e64 killed [[V_AND_B32_e64_]], 1, implicit $exec ; gfx9-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY2]], implicit $exec ; gfx9-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 killed [[V_AND_B32_e64_1]], 1, implicit $exec ; gfx9-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY]], killed [[V_CMP_EQ_U32_e64_]], implicit $exec ; gfx9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY]], implicit $exec - ; gfx9-NEXT: $vcc = V_CMP_EQ_U32_e64 killed [[V_AND_B32_e64_]], 1, implicit $exec ; gfx9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; gfx9-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[COPY]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec ; gfx9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 84148480 diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll index 57b4dba19d3e1..7ed27f008083e 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll @@ -38,12 +38,12 @@ define <2 x half> @add_select_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-LABEL: add_select_fabs_fabs_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; VI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v3 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -52,12 +52,12 @@ define <2 x half> @add_select_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-LABEL: add_select_fabs_fabs_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 @@ -186,16 +186,16 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fabs_fabs_v2f16(<2 x ; GFX9-LABEL: 
add_select_multi_use_lhs_fabs_fabs_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v2, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v5 -; GFX9-NEXT: v_pk_add_f16 v1, v2, v4 +; GFX9-NEXT: v_pk_add_f16 v1, v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SAFE-TRUE16-LABEL: add_select_multi_use_lhs_fabs_fabs_v2f16: @@ -303,31 +303,29 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fabs_fabs_v2f1 ; VI-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; VI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v3 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v2, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_add_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v4 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_mov_b32_e32 v1, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v2, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SAFE-TRUE16-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16: @@ -459,16 +457,16 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fabs_fabs_v2f16(<2 x ; GFX9-LABEL: 
add_select_multi_use_rhs_fabs_fabs_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 -; GFX9-NEXT: v_pk_add_f16 v1, v3, v5 +; GFX9-NEXT: v_pk_add_f16 v1, v2, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SAFE-TRUE16-LABEL: add_select_multi_use_rhs_fabs_fabs_v2f16: @@ -576,12 +574,12 @@ define <2 x half> @add_select_fabs_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; VI-LABEL: add_select_fabs_var_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; VI-NEXT: v_cndmask_b32_sdwa v2, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; VI-NEXT: v_add_f16_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -589,13 +587,13 @@ define <2 x half> @add_select_fabs_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; GFX9-LABEL: add_select_fabs_var_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; GFX9-NEXT: v_cndmask_b32_sdwa v2, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -686,12 +684,12 @@ define <2 x half> @add_select_fabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-LABEL: add_select_fabs_negk_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; VI-NEXT: v_mov_b32_e32 v4, 0xbc00 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; 
VI-NEXT: v_mov_b32_e32 v2, 0xbc00 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v3 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -700,12 +698,12 @@ define <2 x half> @add_select_fabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-LABEL: add_select_fabs_negk_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xbc00 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xbc00 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v3 @@ -1238,12 +1236,12 @@ define <2 x half> @add_select_fabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-LABEL: add_select_fabs_posk_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; VI-NEXT: v_mov_b32_e32 v4, 0x3c00 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; VI-NEXT: v_mov_b32_e32 v2, 0x3c00 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v3 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1252,12 +1250,12 @@ define <2 x half> @add_select_fabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-LABEL: add_select_fabs_posk_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x3c00 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x3c00 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v3 @@ -1696,28 +1694,26 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fneg_fneg_v2f1 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: 
v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cndmask_b32_sdwa v5, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_xor_b32_e32 v5, 0x80008000, v2 -; VI-NEXT: v_sub_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_xor_b32_e32 v1, 0x80008000, v2 +; VI-NEXT: v_sub_f16_sdwa v2, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v4, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_mov_b32_e32 v1, v5 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_sdwa v5, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_xor_b32_e32 v5, 0x80008000, v2 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v5, v0, s4 +; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v2 ; GFX9-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SAFE-TRUE16-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16: @@ -1940,12 +1936,12 @@ define <2 x half> @add_select_fneg_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; VI-LABEL: add_select_fneg_var_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_xor_b32_e32 v1, 0x80008000, v2 +; VI-NEXT: v_cndmask_b32_sdwa v2, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; VI-NEXT: v_add_f16_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1953,13 +1949,13 @@ define <2 x half> @add_select_fneg_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; GFX9-LABEL: add_select_fneg_var_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v2 +; GFX9-NEXT: v_cndmask_b32_sdwa v2, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 ; 
GFX9-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2050,12 +2046,12 @@ define <2 x half> @add_select_fneg_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-LABEL: add_select_fneg_negk_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, 0x3c00 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_mov_b32_e32 v1, 0x3c00 +; VI-NEXT: v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; VI-NEXT: v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_sub_f16_sdwa v1, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2063,13 +2059,13 @@ define <2 x half> @add_select_fneg_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-LABEL: add_select_fneg_negk_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX9-NEXT: v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2153,12 +2149,12 @@ define <2 x half> @add_select_fneg_inv2pi_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; VI-LABEL: add_select_fneg_inv2pi_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, 0xb118 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_mov_b32_e32 v1, 0xb118 +; VI-NEXT: v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; VI-NEXT: v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_sub_f16_sdwa v1, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2166,13 +2162,13 @@ define <2 x half> @add_select_fneg_inv2pi_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-LABEL: add_select_fneg_inv2pi_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, 0xb118 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xb118 +; GFX9-NEXT: v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2256,12 +2252,12 @@ define <2 x half> @add_select_fneg_neginv2pi_v2f16(<2 x i32> %c, <2 x half> %x, ; VI-LABEL: add_select_fneg_neginv2pi_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, 0x3118 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_mov_b32_e32 v1, 0x3118 +; VI-NEXT: v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; VI-NEXT: v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_sub_f16_sdwa v1, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2269,13 +2265,13 @@ define <2 x half> @add_select_fneg_neginv2pi_v2f16(<2 x i32> %c, <2 x half> %x, ; GFX9-LABEL: add_select_fneg_neginv2pi_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, 0x3118 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3118 +; GFX9-NEXT: v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2673,12 +2669,12 @@ define <2 x half> @add_select_negk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-LABEL: add_select_negk_fneg_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, 0x3c00 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_mov_b32_e32 v1, 0x3c00 +; VI-NEXT: v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; VI-NEXT: v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_sub_f16_sdwa v1, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2686,13 +2682,13 @@ define <2 x half> @add_select_negk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-LABEL: add_select_negk_fneg_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, 0x3c00 ; 
GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX9-NEXT: v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2775,12 +2771,12 @@ define <2 x half> @add_select_fneg_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-LABEL: add_select_fneg_posk_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, 0xbc00 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_mov_b32_e32 v1, 0xbc00 +; VI-NEXT: v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; VI-NEXT: v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_sub_f16_sdwa v1, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2788,13 +2784,13 @@ define <2 x half> @add_select_fneg_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-LABEL: add_select_fneg_posk_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, 0xbc00 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xbc00 +; GFX9-NEXT: v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2877,12 +2873,12 @@ define <2 x half> @add_select_posk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-LABEL: add_select_posk_fneg_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, 0xbc00 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_mov_b32_e32 v1, 0xbc00 +; VI-NEXT: v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; VI-NEXT: v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_sub_f16_sdwa v1, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; 
VI-NEXT: s_setpc_b64 s[30:31] @@ -2890,13 +2886,13 @@ define <2 x half> @add_select_posk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-LABEL: add_select_posk_fneg_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, 0xbc00 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xbc00 +; GFX9-NEXT: v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2987,12 +2983,12 @@ define <2 x half> @add_select_negfabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; VI-LABEL: add_select_negfabs_fabs_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; VI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v1, 0x80008000, v2 +; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v3 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3001,12 +2997,12 @@ define <2 x half> @add_select_negfabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-LABEL: add_select_negfabs_fabs_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_or_b32_e32 v1, 0x80008000, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 @@ -3115,12 +3111,12 @@ define <2 x half> @add_select_fabs_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; VI-LABEL: add_select_fabs_negfabs_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x80008000, v3 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; VI-NEXT: 
v_or_b32_e32 v2, 0x80008000, v3 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3129,12 +3125,12 @@ define <2 x half> @add_select_fabs_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-LABEL: add_select_fabs_negfabs_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x80008000, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x80008000, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 @@ -3243,12 +3239,12 @@ define <2 x half> @add_select_neg_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; VI-LABEL: add_select_neg_fabs_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; VI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_xor_b32_e32 v1, 0x80008000, v2 +; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v3 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3257,12 +3253,12 @@ define <2 x half> @add_select_neg_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; GFX9-LABEL: add_select_neg_fabs_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 @@ -3370,12 +3366,12 @@ define <2 x half> @add_select_fabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; VI-LABEL: add_select_fabs_neg_v2f16: ; VI: ; %bb.0: ; 
VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; VI-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; VI-NEXT: v_xor_b32_e32 v2, 0x80008000, v3 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3384,12 +3380,12 @@ define <2 x half> @add_select_fabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; GFX9-LABEL: add_select_fabs_neg_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX9-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; GFX9-NEXT: v_xor_b32_e32 v2, 0x80008000, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 @@ -3493,12 +3489,12 @@ define <2 x half> @add_select_neg_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; VI-LABEL: add_select_neg_negfabs_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v3 +; VI-NEXT: v_cndmask_b32_sdwa v3, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_sub_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_sub_f16_sdwa v1, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v4, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3506,13 +3502,13 @@ define <2 x half> @add_select_neg_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-LABEL: add_select_neg_negfabs_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v3 +; GFX9-NEXT: v_cndmask_b32_sdwa v3, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 
v0, v1, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3609,12 +3605,12 @@ define <2 x half> @add_select_negfabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; VI-LABEL: add_select_negfabs_neg_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; VI-NEXT: v_cndmask_b32_sdwa v2, v1, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_sub_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_sub_f16_sdwa v1, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v4, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3622,13 +3618,13 @@ define <2 x half> @add_select_negfabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-LABEL: add_select_negfabs_neg_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; GFX9-NEXT: v_cndmask_b32_sdwa v2, v1, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3725,12 +3721,12 @@ define <2 x half> @mul_select_negfabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; VI-LABEL: mul_select_negfabs_posk_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; VI-NEXT: v_mov_b32_e32 v4, 0x4400 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v1, 0x80008000, v2 +; VI-NEXT: v_mov_b32_e32 v2, 0x4400 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_e32 v0, v0, v3 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3739,12 +3735,12 @@ define <2 x half> @mul_select_negfabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-LABEL: mul_select_negfabs_posk_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x4400 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: 
v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_or_b32_e32 v1, 0x80008000, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4400 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 @@ -3955,12 +3951,12 @@ define <2 x half> @mul_select_negfabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; VI-LABEL: mul_select_negfabs_negk_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; VI-NEXT: v_mov_b32_e32 v4, 0xc400 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v1, 0x80008000, v2 +; VI-NEXT: v_mov_b32_e32 v2, 0xc400 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_e32 v0, v0, v3 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3969,12 +3965,12 @@ define <2 x half> @mul_select_negfabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-LABEL: mul_select_negfabs_negk_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xc400 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_or_b32_e32 v1, 0x80008000, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xc400 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 @@ -4189,29 +4185,29 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ; VI-SAFE-LABEL: select_fneg_posk_src_add_v2f16: ; VI-SAFE: ; %bb.0: ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_mov_b32_e32 v3, 0x4400 -; VI-SAFE-NEXT: v_add_f16_sdwa v3, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-SAFE-NEXT: v_add_f16_e32 v2, 4.0, v2 -; VI-SAFE-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; VI-SAFE-NEXT: v_mov_b32_e32 v3, 0x4000 -; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0x4400 +; VI-SAFE-NEXT: v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-SAFE-NEXT: 
v_add_f16_e32 v2, 4.0, v2 +; VI-SAFE-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-SAFE-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; VI-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 +; VI-SAFE-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SAFE-LABEL: select_fneg_posk_src_add_v2f16: ; GFX9-SAFE: ; %bb.0: ; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0] -; GFX9-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX9-SAFE-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX9-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-SAFE-NEXT: v_pk_add_f16 v1, v2, 4.0 op_sel_hi:[1,0] +; GFX9-SAFE-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX9-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-SAFE-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-SAFE-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-SAFE-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] @@ -4262,26 +4258,26 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ; VI-NSZ-LABEL: select_fneg_posk_src_add_v2f16: ; VI-NSZ: ; %bb.0: ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NSZ-NEXT: v_mov_b32_e32 v3, 0xc400 -; VI-NSZ-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NSZ-NEXT: v_sub_f16_e32 v2, -4.0, v2 -; VI-NSZ-NEXT: v_mov_b32_e32 v4, 0x4000 -; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NSZ-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0xc400 +; VI-NSZ-NEXT: v_sub_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NSZ-NEXT: v_sub_f16_e32 v2, -4.0, v2 +; VI-NSZ-NEXT: v_mov_b32_e32 v3, 0x4000 +; VI-NSZ-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] +; VI-NSZ-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NSZ-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NSZ-LABEL: select_fneg_posk_src_add_v2f16: ; GFX9-NSZ: ; %bb.0: ; GFX9-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NSZ-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] -; GFX9-NSZ-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX9-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NSZ-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NSZ-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NSZ-NEXT: v_pk_add_f16 v1, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX9-NSZ-NEXT: v_mov_b32_e32 v2, 0x4000 +; 
GFX9-NSZ-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NSZ-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NSZ-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NSZ-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NSZ-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NSZ-NEXT: s_setpc_b64 s[30:31] @@ -4344,29 +4340,29 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; VI-SAFE-LABEL: select_fneg_posk_src_sub_v2f16: ; VI-SAFE: ; %bb.0: ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_mov_b32_e32 v3, 0xc400 -; VI-SAFE-NEXT: v_add_f16_sdwa v3, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-SAFE-NEXT: v_add_f16_e32 v2, -4.0, v2 -; VI-SAFE-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; VI-SAFE-NEXT: v_mov_b32_e32 v3, 0x4000 -; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0xc400 +; VI-SAFE-NEXT: v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-SAFE-NEXT: v_add_f16_e32 v2, -4.0, v2 +; VI-SAFE-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-SAFE-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; VI-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 +; VI-SAFE-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SAFE-LABEL: select_fneg_posk_src_sub_v2f16: ; GFX9-SAFE: ; %bb.0: ; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] -; GFX9-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX9-SAFE-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX9-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-SAFE-NEXT: v_pk_add_f16 v1, v2, -4.0 op_sel_hi:[1,0] +; GFX9-SAFE-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX9-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-SAFE-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-SAFE-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-SAFE-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] @@ -4417,26 +4413,26 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; VI-NSZ-LABEL: select_fneg_posk_src_sub_v2f16: ; VI-NSZ: ; %bb.0: ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NSZ-NEXT: v_mov_b32_e32 v3, 0x4400 -; VI-NSZ-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NSZ-NEXT: v_sub_f16_e32 v2, 4.0, v2 -; VI-NSZ-NEXT: v_mov_b32_e32 v4, 0x4000 -; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NSZ-NEXT: 
v_cndmask_b32_sdwa v1, v4, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x4400 +; VI-NSZ-NEXT: v_sub_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NSZ-NEXT: v_sub_f16_e32 v2, 4.0, v2 +; VI-NSZ-NEXT: v_mov_b32_e32 v3, 0x4000 +; VI-NSZ-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] +; VI-NSZ-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NSZ-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NSZ-LABEL: select_fneg_posk_src_sub_v2f16: ; GFX9-NSZ: ; %bb.0: ; GFX9-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NSZ-NEXT: v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] -; GFX9-NSZ-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX9-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NSZ-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NSZ-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NSZ-NEXT: v_pk_add_f16 v1, v2, 4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX9-NSZ-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-NSZ-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NSZ-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NSZ-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NSZ-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NSZ-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NSZ-NEXT: s_setpc_b64 s[30:31] @@ -4491,26 +4487,26 @@ define <2 x half> @select_fneg_posk_src_mul_v2f16(<2 x i32> %c, <2 x half> %x) { ; VI-LABEL: select_fneg_posk_src_mul_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, 0xc400 -; VI-NEXT: v_mul_f16_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_mul_f16_e32 v2, -4.0, v2 -; VI-NEXT: v_mov_b32_e32 v4, 0x4000 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v1, 0xc400 +; VI-NEXT: v_mul_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v2, -4.0, v2 +; VI-NEXT: v_mov_b32_e32 v3, 0x4000 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: select_fneg_posk_src_mul_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_mul_f16 v2, v2, -4.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_pk_mul_f16 v1, v2, -4.0 op_sel_hi:[1,0] +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 
v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4602,31 +4598,31 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, < ; VI-SAFE-LABEL: select_fneg_posk_src_fma_v2f16: ; VI-SAFE: ; %bb.0: ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; VI-SAFE-NEXT: v_fma_f16 v4, v5, 4.0, v4 -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; VI-SAFE-NEXT: v_fma_f16 v2, v2, 4.0, v3 -; VI-SAFE-NEXT: v_or_b32_e32 v2, v2, v4 -; VI-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; VI-SAFE-NEXT: v_mov_b32_e32 v3, 0x4000 -; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-SAFE-NEXT: v_fma_f16 v1, v4, 4.0, v1 +; VI-SAFE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-SAFE-NEXT: v_fma_f16 v2, v2, 4.0, v3 +; VI-SAFE-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-SAFE-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; VI-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 +; VI-SAFE-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SAFE-LABEL: select_fneg_posk_src_fma_v2f16: ; GFX9-SAFE: ; %bb.0: ; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_pk_fma_f16 v2, v2, 4.0, v3 op_sel_hi:[1,0,1] -; GFX9-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX9-SAFE-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX9-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-SAFE-NEXT: v_pk_fma_f16 v1, v2, 4.0, v3 op_sel_hi:[1,0,1] +; GFX9-SAFE-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX9-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-SAFE-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-SAFE-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-SAFE-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] @@ -4662,27 +4658,27 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, < ; VI-NSZ-LABEL: select_fneg_posk_src_fma_v2f16: ; VI-NSZ: ; %bb.0: ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NSZ-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; VI-NSZ-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NSZ-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; VI-NSZ-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NSZ-NEXT: v_fma_f16 v1, v4, -4.0, -v1 ; VI-NSZ-NEXT: v_fma_f16 v2, v2, -4.0, -v3 ; VI-NSZ-NEXT: v_mov_b32_e32 v3, 0x4000 -; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NSZ-NEXT: v_fma_f16 v4, v5, -4.0, 
-v4 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NSZ-NEXT: v_cndmask_b32_sdwa v1, v3, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NSZ-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] +; VI-NSZ-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NSZ-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NSZ-LABEL: select_fneg_posk_src_fma_v2f16: ; GFX9-NSZ: ; %bb.0: ; GFX9-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NSZ-NEXT: v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GFX9-NSZ-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX9-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NSZ-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NSZ-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NSZ-NEXT: v_pk_fma_f16 v1, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX9-NSZ-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-NSZ-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NSZ-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NSZ-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NSZ-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NSZ-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NSZ-NEXT: s_setpc_b64 s[30:31] @@ -4751,31 +4747,31 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ; VI-SAFE-LABEL: select_fneg_posk_src_fmad_v2f16: ; VI-SAFE: ; %bb.0: ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; VI-SAFE-NEXT: v_fma_f16 v4, v5, 4.0, v4 -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; VI-SAFE-NEXT: v_fma_f16 v2, v2, 4.0, v3 -; VI-SAFE-NEXT: v_or_b32_e32 v2, v2, v4 -; VI-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; VI-SAFE-NEXT: v_mov_b32_e32 v3, 0x4000 -; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-SAFE-NEXT: v_fma_f16 v1, v4, 4.0, v1 +; VI-SAFE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-SAFE-NEXT: v_fma_f16 v2, v2, 4.0, v3 +; VI-SAFE-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-SAFE-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; VI-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 +; VI-SAFE-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SAFE-LABEL: select_fneg_posk_src_fmad_v2f16: ; GFX9-SAFE: ; %bb.0: ; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_pk_fma_f16 v2, v2, 4.0, v3 op_sel_hi:[1,0,1] -; GFX9-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX9-SAFE-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX9-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, 
v2, vcc ; GFX9-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-SAFE-NEXT: v_pk_fma_f16 v1, v2, 4.0, v3 op_sel_hi:[1,0,1] +; GFX9-SAFE-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX9-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-SAFE-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-SAFE-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-SAFE-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] @@ -4832,27 +4828,27 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ; VI-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16: ; VI-NSZ: ; %bb.0: ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NSZ-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; VI-NSZ-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NSZ-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; VI-NSZ-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NSZ-NEXT: v_fma_f16 v1, v4, -4.0, -v1 ; VI-NSZ-NEXT: v_fma_f16 v2, v2, -4.0, -v3 ; VI-NSZ-NEXT: v_mov_b32_e32 v3, 0x4000 -; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NSZ-NEXT: v_fma_f16 v4, v5, -4.0, -v4 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NSZ-NEXT: v_cndmask_b32_sdwa v1, v3, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NSZ-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] +; VI-NSZ-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NSZ-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16: ; GFX9-NSZ: ; %bb.0: ; GFX9-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NSZ-NEXT: v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GFX9-NSZ-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX9-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NSZ-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NSZ-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NSZ-NEXT: v_pk_fma_f16 v1, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX9-NSZ-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-NSZ-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NSZ-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NSZ-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NSZ-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NSZ-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NSZ-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index b8187231ea709..cf2f84bbfa97d 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -806,38 +806,38 @@ define amdgpu_kernel void @select_v2f16( ; VI-LABEL: select_v2f16: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x44 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s18, s2 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 +; VI-NEXT: s_mov_b32 s7, 0xf000 
+; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s18, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s16, s10 ; VI-NEXT: s_mov_b32 s17, s11 -; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_mov_b32 s19, s7 ; VI-NEXT: s_mov_b32 s20, s12 ; VI-NEXT: s_mov_b32 s21, s13 -; VI-NEXT: s_mov_b32 s22, s2 -; VI-NEXT: s_mov_b32 s23, s3 +; VI-NEXT: s_mov_b32 s22, s6 +; VI-NEXT: s_mov_b32 s23, s7 ; VI-NEXT: s_mov_b32 s12, s14 ; VI-NEXT: s_mov_b32 s13, s15 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 ; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; VI-NEXT: buffer_load_dword v1, off, s[20:23], 0 -; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v3, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s0, s8 -; VI-NEXT: s_mov_b32 s1, s9 +; VI-NEXT: s_mov_b32 s4, s8 +; VI-NEXT: s_mov_b32 s5, s9 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc ; VI-NEXT: v_cmp_lt_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_cndmask_b32_sdwa v0, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: v_cmp_lt_f16_e64 s[0:1], v0, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v3, s[0:1] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: select_v2f16: @@ -1020,12 +1020,12 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc ; VI-NEXT: v_cmp_gt_f16 vcc, v0, v3 src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_sdwa v0, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cmp_lt_f16_e64 s[0:1], 0.5, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; @@ -1195,12 +1195,12 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc ; VI-NEXT: v_cmp_lt_f16 vcc, v0, v3 src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_sdwa v0, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_cmp_gt_f16_e64 s[0:1], 0.5, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; @@ -1367,7 +1367,6 @@ define amdgpu_kernel void @select_v2f16_imm_c( ; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; VI-NEXT: v_mov_b32_e32 v3, 0x3800 -; VI-NEXT: v_mov_b32_e32 v4, 0x3900 ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -1375,7 +1374,8 @@ define amdgpu_kernel void @select_v2f16_imm_c( ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc ; VI-NEXT: v_cmp_nlt_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_cndmask_b32_sdwa v0, v4, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_mov_b32_e32 v0, 0x3900 +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm @@ -1538,21 +1538,21 @@ define amdgpu_kernel void @select_v2f16_imm_d( ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; VI-NEXT: v_mov_b32_e32 v3, 0x3800 -; VI-NEXT: v_mov_b32_e32 v4, 0x3900 ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc ; VI-NEXT: v_cmp_lt_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_cndmask_b32_sdwa v0, v4, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cmp_lt_f16_e64 s[0:1], v0, v1 +; VI-NEXT: v_mov_b32_e32 v1, 0x3900 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[0:1] +; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; From c10059473653bacc32725c9abcaedc18351f21f5 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Wed, 30 Apr 2025 05:46:24 -0400 Subject: [PATCH 05/26] Handle undef carry-in operand --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index b3e9b424cf1b0..fe4b054da9c91 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1110,11 +1110,13 @@ void SIPeepholeSDWA::convertToImplicitVcc(MachineInstr &MI, // Change destination of compare instruction to VCC // or copy to VCC if carry-in is not a compare inst. 
Register CarryReg = CarryIn.getReg(); - MachineInstr &CarryDef = *MRI->getVRegDef(CarryReg); + MachineInstr *CarryDef = MRI->getVRegDef(CarryReg); + if (!CarryDef) + return; - if (CarryDef.isCompare() && TII->isVOP3(CarryDef) && + if (CarryDef->isCompare() && TII->isVOP3(*CarryDef) && MRI->hasOneUse(CarryIn.getReg())) - CarryDef.substituteRegister(CarryIn.getReg(), Vcc, 0, *TRI); + CarryDef->substituteRegister(CarryIn.getReg(), Vcc, 0, *TRI); else { // Add write: VCC[lanedId] <- (CarryIn[laneId] == 1) const TargetRegisterClass *Class = From b2a5bab5c5a976443761051df86d6f4232c07ada Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Wed, 30 Apr 2025 05:48:06 -0400 Subject: [PATCH 06/26] Remove extra newline from debug output --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index fe4b054da9c91..e0917e42af233 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1134,7 +1134,7 @@ void SIPeepholeSDWA::convertToImplicitVcc(MachineInstr &MI, .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)) .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1)) .setMIFlags(MI.getFlags()); - LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted << '\n'); + LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted); MI.eraseFromParent(); } From 65d7dd1e7948b7e663c723ede4566e68b40bd9f2 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Wed, 30 Apr 2025 05:59:48 -0400 Subject: [PATCH 07/26] Rename test files to indicate the different ISAs being tested --- ...a-peephole-vcnd_mask-2.mir => sdwa-peephole-cndmask-gfx10.mir} | 0 ...wa-peephole-vcnd_mask-1.mir => sdwa-peephole-cndmask-gfx9.mir} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename llvm/test/CodeGen/AMDGPU/{sdwa-peephole-vcnd_mask-2.mir => sdwa-peephole-cndmask-gfx10.mir} (100%) rename llvm/test/CodeGen/AMDGPU/{sdwa-peephole-vcnd_mask-1.mir => sdwa-peephole-cndmask-gfx9.mir} (100%) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-vcnd_mask-2.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-gfx10.mir similarity index 100% rename from llvm/test/CodeGen/AMDGPU/sdwa-peephole-vcnd_mask-2.mir rename to llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-gfx10.mir diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-vcnd_mask-1.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-gfx9.mir similarity index 100% rename from llvm/test/CodeGen/AMDGPU/sdwa-peephole-vcnd_mask-1.mir rename to llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-gfx9.mir From b0e665e0140b108af20a1aba12a55f0ebaa65bd4 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Wed, 30 Apr 2025 06:27:37 -0400 Subject: [PATCH 08/26] Use COPY instead of V_CMP_EQ for copy to VCC --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 26 +- llvm/test/CodeGen/AMDGPU/bf16.ll | 13 +- .../CodeGen/AMDGPU/extract_vector_elt-f16.ll | 42 +-- llvm/test/CodeGen/AMDGPU/fmaximum3.ll | 160 ++++----- llvm/test/CodeGen/AMDGPU/fminimum3.ll | 160 ++++----- .../CodeGen/AMDGPU/insert_vector_dynelt.ll | 25 +- .../AMDGPU/insert_vector_elt.v2bf16.ll | 337 +++++++++--------- .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 183 +++++----- llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 38 +- llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 38 +- 10 files changed, 482 insertions(+), 540 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 
e0917e42af233..228b16e0ad70a 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1063,17 +1063,6 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI, MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI); } -static unsigned getVCmpEqOpcode(unsigned Bits) { - if (Bits == 64) - return AMDGPU::V_CMP_EQ_U64_e64; - if (Bits == 32) - return AMDGPU::V_CMP_EQ_U32_e64; - if (Bits == 16) - return AMDGPU::V_CMP_EQ_U16_e64; - - llvm_unreachable("Unexpected register bit width."); -}; - /// Try to convert an \p MI in VOP3 which takes an src2 carry-in /// operand into the corresponding VOP2 form which expects the /// argument in VCC. To this end, either try to change the definition @@ -1107,26 +1096,21 @@ void SIPeepholeSDWA::convertToImplicitVcc(MachineInstr &MI, LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction.\n"); return; } - // Change destination of compare instruction to VCC - // or copy to VCC if carry-in is not a compare inst. + Register CarryReg = CarryIn.getReg(); MachineInstr *CarryDef = MRI->getVRegDef(CarryReg); if (!CarryDef) return; + // Change destination of compare instruction to VCC + // or copy to VCC if carry-in is not a compare inst. if (CarryDef->isCompare() && TII->isVOP3(*CarryDef) && MRI->hasOneUse(CarryIn.getReg())) CarryDef->substituteRegister(CarryIn.getReg(), Vcc, 0, *TRI); - else { - // Add write: VCC[lanedId] <- (CarryIn[laneId] == 1) - const TargetRegisterClass *Class = - TRI->getRegClassForOperandReg(*MRI, CarryIn); - unsigned RegSize = Class->MC->getSizeInBits(); - BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(getVCmpEqOpcode(RegSize))) + else + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY)) .addReg(Vcc, RegState::Define) - .addImm(1) .add(CarryIn); - } auto Converted = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::getVOPe32(MI.getOpcode()))) diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 0d91305917b3e..cb089b15e0f4c 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -38481,7 +38481,6 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_cmp_eq_u64_e64 vcc, 1, vcc ; GFX8-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -38492,9 +38491,8 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -38759,15 +38757,14 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; ; GFX8-LABEL: s_select_v2bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, 
v3, vcc -; GFX8-NEXT: v_cmp_eq_u64_e64 vcc, 1, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index ea2c82f2679f3..35fe6ebaf1b12 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -605,33 +605,29 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; VI-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; VI-NEXT: v_mov_b32_e32 v6, s1 -; VI-NEXT: s_cmp_eq_u32 s4, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, s0, v0 +; VI-NEXT: s_cmp_eq_u32 s4, 1 ; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_sdwa v0, v1, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 3 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 4 ; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 5 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s4, 6 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s4, 7 -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s4, 6 ; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] -; VI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s4, 7 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: flat_store_short v[5:6], v0 ; VI-NEXT: s_endpgm @@ -863,31 +859,27 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 9 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 10 ; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 11 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; VI-NEXT: 
s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 12 ; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 13 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s4, 14 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s4, 15 -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s4, 14 ; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] -; VI-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1] +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s4, 15 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v8, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: flat_store_short v[9:10], v0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index 222287e62209f..567202be69fa6 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -2080,12 +2080,11 @@ define <2 x half> @v_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c ; GFX942-NEXT: v_pk_max_f16 v3, v0, v1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v2, v1 @@ -2127,12 +2126,11 @@ define <2 x half> @v_fmaximum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x ; GFX942-NEXT: v_pk_max_f16 v3, v0, v1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, v2 @@ -2176,15 +2174,13 @@ define <2 x half> @v_fmaximum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 ; GFX942-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v3, v3, v4 ; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] 
-; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 ; GFX942-NEXT: v_cndmask_b32_sdwa v6, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX942-NEXT: v_perm_b32 v1, v6, v0, s0 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, |v2| src0_sel:DWORD src1_sel:WORD_1 @@ -2232,12 +2228,11 @@ define <2 x half> @v_fmaximum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 ; GFX942-NEXT: v_pk_max_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1] ; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -2767,38 +2762,37 @@ define <4 x half> @v_fmaximum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c ; GFX942-NEXT: v_pk_max_f16 v6, v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_mov_b32 s2, 0x5040100 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v1, v6, s2 +; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v5, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v5, v1 src0_sel:WORD_1 src1_sel:DWORD -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v0, v8, s2 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v4, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 -; GFX942-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 ; 
GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v0, v0, v5, s2 +; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v4f16: @@ -2833,38 +2827,37 @@ define <4 x half> @v_fmaximum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x ; GFX942-NEXT: v_pk_max_f16 v6, v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_mov_b32 s2, 0x5040100 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v1, v6, s2 +; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v5 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v1, v5 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v0, v8, s2 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 -; GFX942-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v0, v0, v5, s2 +; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v4f16_commute: @@ -2912,7 +2905,7 @@ define <4 x half> @v_fmaximum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 ; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v6, v6, v8 -; GFX942-NEXT: s_mov_b32 s2, 0x5040100 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc ; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 @@ -2920,29 +2913,28 
@@ define <4 x half> @v_fmaximum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 ; GFX942-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v8, |v5| src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc -; GFX942-NEXT: v_perm_b32 v2, v8, v1, s2 +; GFX942-NEXT: v_perm_b32 v2, v8, v1, s0 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v11 -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v9, v0, s2 -; GFX942-NEXT: v_pk_max_f16 v6, v6, v10 +; GFX942-NEXT: v_perm_b32 v6, v9, v0, s0 ; GFX942-NEXT: v_cndmask_b32_sdwa v3, v12, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_pk_max_f16 v6, v6, v10 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v7, v12, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| -; GFX942-NEXT: v_perm_b32 v1, v3, v1, s2 +; GFX942-NEXT: v_perm_b32 v1, v3, v1, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc -; GFX942-NEXT: v_perm_b32 v0, v7, v0, s2 +; GFX942-NEXT: v_perm_b32 v0, v7, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v4f16__fabs_all: @@ -2986,38 +2978,37 @@ define <4 x half> @v_fmaximum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 ; GFX942-NEXT: v_pk_max_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] ; GFX942-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_mov_b32 s2, 0x5040100 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v1, v6, s2 +; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v1, -v5 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 
1, s[0:1] +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v0, v8, s2 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 -; GFX942-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v0, v0, v5, s2 +; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v4f16__fneg_all: @@ -3059,23 +3050,22 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v7 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_mov_b32 s2, 0x5040100 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v5, v8, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v8, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc -; GFX942-NEXT: v_perm_b32 v4, v8, v1, s2 +; GFX942-NEXT: v_perm_b32 v4, v8, v1, s0 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v4, v4, v3 -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v7, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v6, v6, v0, s2 +; GFX942-NEXT: v_perm_b32 v6, v6, v0, s0 ; GFX942-NEXT: v_pk_max_f16 v6, v6, v2 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v8, v5, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -3083,10 +3073,10 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX942-NEXT: v_perm_b32 v1, v7, v1, s2 +; GFX942-NEXT: v_perm_b32 v1, v7, v1, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; GFX942-NEXT: v_perm_b32 v0, v8, v0, s2 +; GFX942-NEXT: v_perm_b32 v0, v8, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v4f16__inlineimm1: @@ -3121,19 +3111,18 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) { ; GFX942-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; 
GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v1, v4, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] @@ -3972,12 +3961,11 @@ define <4 x half> @v_no_fmaximum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b, ; GFX942-NEXT: v_pk_max_f16 v3, v0, v1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v1, v5, s0 ; GFX942-NEXT: v_pk_max_f16 v3, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll index df0e7a03b4ab6..81b8e8ebd10e3 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -2080,12 +2080,11 @@ define <2 x half> @v_fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c ; GFX942-NEXT: v_pk_min_f16 v3, v0, v1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v2, v1 @@ -2127,12 +2126,11 @@ define <2 x half> @v_fminimum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x ; GFX942-NEXT: v_pk_min_f16 v3, v0, v1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v1, v2 @@ -2176,15 +2174,13 @@ define <2 x half> @v_fminimum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 ; GFX942-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v3, v3, v4 ; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 ; GFX942-NEXT: v_cndmask_b32_sdwa v6, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX942-NEXT: v_perm_b32 v1, v6, v0, s0 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, |v2| src0_sel:DWORD src1_sel:WORD_1 @@ -2232,12 +2228,11 @@ define <2 x half> @v_fminimum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 ; GFX942-NEXT: v_pk_min_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1] ; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1] @@ -2767,38 +2762,37 @@ define <4 x half> @v_fminimum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c ; GFX942-NEXT: v_pk_min_f16 v6, v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_mov_b32 s2, 0x5040100 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v1, v6, s2 +; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v5, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 -; GFX942-NEXT: v_cmp_o_f16_sdwa 
s[0:1], v5, v1 src0_sel:WORD_1 src1_sel:DWORD -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v0, v8, s2 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v4, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 -; GFX942-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v0, v0, v5, s2 +; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v4f16: @@ -2833,38 +2827,37 @@ define <4 x half> @v_fminimum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x ; GFX942-NEXT: v_pk_min_f16 v6, v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_mov_b32 s2, 0x5040100 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v1, v6, s2 +; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v5 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v1, v5 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v0, v8, s2 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 -; GFX942-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v0, v0, v5, s2 +; 
GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v4f16_commute: @@ -2912,7 +2905,7 @@ define <4 x half> @v_fminimum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 ; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v6, v6, v8 -; GFX942-NEXT: s_mov_b32 s2, 0x5040100 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc ; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 @@ -2920,29 +2913,28 @@ define <4 x half> @v_fminimum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 ; GFX942-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v8, |v5| src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc -; GFX942-NEXT: v_perm_b32 v2, v8, v1, s2 +; GFX942-NEXT: v_perm_b32 v2, v8, v1, s0 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v11 -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v9, v0, s2 -; GFX942-NEXT: v_pk_min_f16 v6, v6, v10 +; GFX942-NEXT: v_perm_b32 v6, v9, v0, s0 ; GFX942-NEXT: v_cndmask_b32_sdwa v3, v12, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_pk_min_f16 v6, v6, v10 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v7, v12, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| -; GFX942-NEXT: v_perm_b32 v1, v3, v1, s2 +; GFX942-NEXT: v_perm_b32 v1, v3, v1, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc -; GFX942-NEXT: v_perm_b32 v0, v7, v0, s2 +; GFX942-NEXT: v_perm_b32 v0, v7, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v4f16__fabs_all: @@ -2986,38 +2978,37 @@ define <4 x half> @v_fminimum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 ; GFX942-NEXT: v_pk_min_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] ; GFX942-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_mov_b32 s2, 0x5040100 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; 
GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v1, v6, s2 +; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v1, -v5 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v2, v0, v8, s2 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 -; GFX942-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v0, v0, v5, s2 +; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v4f16__fneg_all: @@ -3059,23 +3050,22 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v7 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_mov_b32 s2, 0x5040100 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v5, v8, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v8, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc -; GFX942-NEXT: v_perm_b32 v4, v8, v1, s2 +; GFX942-NEXT: v_perm_b32 v4, v8, v1, s0 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v4, v4, v3 -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v7, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v6, v6, v0, s2 +; GFX942-NEXT: v_perm_b32 v6, v6, v0, s0 ; GFX942-NEXT: v_pk_min_f16 v6, v6, v2 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v8, v5, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -3083,10 +3073,10 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX942-NEXT: v_perm_b32 v1, v7, v1, s2 +; GFX942-NEXT: v_perm_b32 v1, v7, v1, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: 
v_cndmask_b32_e32 v0, v5, v6, vcc -; GFX942-NEXT: v_perm_b32 v0, v8, v0, s2 +; GFX942-NEXT: v_perm_b32 v0, v8, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v4f16__inlineimm1: @@ -3121,19 +3111,18 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) { ; GFX942-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v1, v4, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] @@ -3972,12 +3961,11 @@ define <4 x half> @v_no_fminimum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b, ; GFX942-NEXT: v_pk_min_f16 v3, v0, v1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX942-NEXT: v_cmp_o_f16_sdwa s[0:1], v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[0:1] ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v1, v5, s0 ; GFX942-NEXT: v_pk_min_f16 v3, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index 573fdd392267b..6925a98f643b9 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -336,48 +336,47 @@ define amdgpu_kernel void @half8_inselt(ptr addrspace(1) %out, <8 x half> %vec, ; GCN-LABEL: half8_inselt: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 -; GCN-NEXT: s_load_dword s8, s[4:5], 0x44 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x44 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s6, s3, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 7 -; GCN-NEXT: v_mov_b32_e32 v1, s6 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[6:7] -; GCN-NEXT: s_cmp_lg_u32 s8, 6 +; GCN-NEXT: s_lshr_b32 s7, s3, 16 +; GCN-NEXT: s_cmp_lg_u32 s6, 7 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_lg_u32 s6, 6 ; GCN-NEXT: v_cndmask_b32_sdwa v1, v0, v1, vcc dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_lshr_b32 s3, s2, 16 ; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 5 +; GCN-NEXT: s_cmp_lg_u32 s6, 5 ; GCN-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 4 +; GCN-NEXT: s_cmp_lg_u32 s6, 4 ; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_lshr_b32 s2, s1, 16 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 3 +; GCN-NEXT: s_cmp_lg_u32 s6, 3 ; GCN-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 2 +; GCN-NEXT: s_cmp_lg_u32 s6, 2 ; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v4, s1 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_lshr_b32 s1, s0, 16 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 1 +; GCN-NEXT: s_cmp_lg_u32 s6, 1 ; GCN-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v4, s1 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v5, s0 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll index 139dd059997f5..47a371d8de07c 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll @@ -1131,119 +1131,118 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s5, 2 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s5, 3 ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; VI-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s5, 0 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 1 -; VI-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 1 +; VI-NEXT: v_cndmask_b32_e32 v8, v0, v6, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e64 v7, v0, v6, s[0:1] ; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v6, vcc dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX900-LABEL: v_insertelement_v8bf16_dynamic: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX900-NEXT: s_mov_b32 s16, 0x5040100 +; GFX900-NEXT: s_mov_b32 s14, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[14:15] -; GFX900-NEXT: s_cmp_eq_u32 s11, 6 +; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[18:19] +; GFX900-NEXT: s_cmp_eq_u32 s13, 6 +; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s13, 7 ; GFX900-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s11, 7 +; GFX900-NEXT: s_cmp_eq_u32 s13, 4 ; GFX900-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s11, 4 -; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] -; GFX900-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s11, 5 -; GFX900-NEXT: v_mov_b32_e32 v5, s10 -; GFX900-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s11, 2 +; GFX900-NEXT: s_cmp_eq_u32 s13, 5 ; GFX900-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s11, 3 -; GFX900-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s11, 0 +; GFX900-NEXT: s_cmp_eq_u32 s13, 2 +; GFX900-NEXT: v_mov_b32_e32 v5, s12 ; GFX900-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s11, 1 +; GFX900-NEXT: s_cmp_eq_u32 s13, 3 +; GFX900-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s13, 0 ; GFX900-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s13, 1 +; GFX900-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_cndmask_b32_e64 v6, v3, v5, s[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc +; GFX900-NEXT: s_mov_b64 vcc, s[0:1] ; GFX900-NEXT: v_cndmask_b32_sdwa v3, v3, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[8:9] +; GFX900-NEXT: s_mov_b64 vcc, s[4:5] ; GFX900-NEXT: v_cndmask_b32_e64 v7, v2, v5, s[2:3] ; GFX900-NEXT: v_cndmask_b32_sdwa v2, v2, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[14:15] -; GFX900-NEXT: v_cndmask_b32_e64 v8, v1, v5, s[4:5] +; GFX900-NEXT: s_mov_b64 vcc, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v8, v1, v5, s[6:7] ; GFX900-NEXT: v_cndmask_b32_sdwa v1, v1, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[10:11] -; GFX900-NEXT: v_perm_b32 v3, v3, v6, s16 -; GFX900-NEXT: v_cndmask_b32_e64 v6, v0, v5, s[6:7] +; GFX900-NEXT: s_mov_b64 vcc, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v0, v5, s[10:11] ; GFX900-NEXT: v_cndmask_b32_sdwa v0, v0, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX900-NEXT: v_perm_b32 v2, v2, v7, s16 -; GFX900-NEXT: v_perm_b32 v1, v1, v8, s16 -; GFX900-NEXT: v_perm_b32 v0, v0, v6, s16 -; GFX900-NEXT: global_store_dwordx4 
v4, v[0:3], s[12:13] +; GFX900-NEXT: v_perm_b32 v3, v3, v6, s14 +; GFX900-NEXT: v_perm_b32 v2, v2, v7, s14 +; GFX900-NEXT: v_perm_b32 v1, v1, v8, s14 +; GFX900-NEXT: v_perm_b32 v0, v0, v9, s14 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX900-NEXT: s_endpgm ; ; GFX942-LABEL: v_insertelement_v8bf16_dynamic: ; GFX942: ; %bb.0: -; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX942-NEXT: s_mov_b32 s16, 0x5040100 +; GFX942-NEXT: s_mov_b32 s14, 0x5040100 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[10:11] +; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[18:19] ; GFX942-NEXT: s_cmp_eq_u32 s13, 6 -; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX942-NEXT: s_cmp_eq_u32 s13, 7 -; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX942-NEXT: s_cmp_eq_u32 s13, 4 -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] ; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX942-NEXT: s_cmp_eq_u32 s13, 5 -; GFX942-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s13, 2 ; GFX942-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s13, 3 +; GFX942-NEXT: s_cmp_eq_u32 s13, 2 ; GFX942-NEXT: v_mov_b32_e32 v5, s12 -; GFX942-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s13, 0 ; GFX942-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s13, 3 +; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s13, 0 +; GFX942-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GFX942-NEXT: s_cmp_eq_u32 s13, 1 ; GFX942-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_cndmask_b32_e64 v6, v3, v5, s[0:1] +; GFX942-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc +; GFX942-NEXT: s_mov_b64 vcc, s[0:1] ; GFX942-NEXT: v_cndmask_b32_sdwa v3, v3, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[10:11] +; GFX942-NEXT: s_mov_b64 vcc, s[4:5] ; GFX942-NEXT: v_cndmask_b32_e64 v7, v2, v5, s[2:3] -; GFX942-NEXT: v_cndmask_b32_e64 v8, v1, v5, s[4:5] ; GFX942-NEXT: v_cndmask_b32_sdwa v2, v2, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[14:15] -; GFX942-NEXT: v_cndmask_b32_e64 v9, v0, v5, s[6:7] -; GFX942-NEXT: v_perm_b32 v3, v3, v6, s16 +; GFX942-NEXT: s_mov_b64 vcc, s[8:9] +; GFX942-NEXT: v_cndmask_b32_e64 v8, v1, v5, s[6:7] ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v1, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[12:13] -; GFX942-NEXT: v_perm_b32 v2, v2, v7, s16 -; GFX942-NEXT: v_perm_b32 v1, v1, v8, s16 +; GFX942-NEXT: s_mov_b64 vcc, s[12:13] +; GFX942-NEXT: v_cndmask_b32_e64 v9, v0, v5, s[10:11] ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v0, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX942-NEXT: v_perm_b32 v0, v0, v9, s16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX942-NEXT: v_perm_b32 v3, v3, v6, s14 +; GFX942-NEXT: v_perm_b32 v2, v2, v7, s14 +; GFX942-NEXT: v_perm_b32 v1, v1, v8, s14 +; GFX942-NEXT: v_perm_b32 v0, v0, v9, s14 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX942-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid 
to i64 @@ -1558,163 +1557,163 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; ; GFX900-LABEL: v_insertelement_v16bf16_dynamic: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[20:21], s[8:9], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[28:29], s[8:9], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; GFX900-NEXT: s_mov_b32 s33, 0x5040100 +; GFX900-NEXT: s_mov_b32 s30, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: global_load_dwordx4 v[0:3], v8, s[18:19] -; GFX900-NEXT: global_load_dwordx4 v[4:7], v8, s[18:19] offset:16 -; GFX900-NEXT: s_cmp_eq_u32 s21, 6 +; GFX900-NEXT: global_load_dwordx4 v[0:3], v8, s[38:39] +; GFX900-NEXT: global_load_dwordx4 v[4:7], v8, s[38:39] offset:16 +; GFX900-NEXT: s_cmp_eq_u32 s29, 6 +; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s29, 7 ; GFX900-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s21, 7 -; GFX900-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s21, 4 -; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] +; GFX900-NEXT: s_cmp_eq_u32 s29, 4 ; GFX900-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s21, 5 -; GFX900-NEXT: v_mov_b32_e32 v9, s20 -; GFX900-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s21, 2 +; GFX900-NEXT: s_cmp_eq_u32 s29, 5 ; GFX900-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s21, 3 -; GFX900-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s21, 0 +; GFX900-NEXT: s_cmp_eq_u32 s29, 2 ; GFX900-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s21, 1 -; GFX900-NEXT: s_cselect_b64 s[24:25], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s21, 14 +; GFX900-NEXT: s_cmp_eq_u32 s29, 3 ; GFX900-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s21, 15 -; GFX900-NEXT: s_cselect_b64 s[26:27], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s21, 12 +; GFX900-NEXT: s_cmp_eq_u32 s29, 0 ; GFX900-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s21, 13 -; GFX900-NEXT: s_cselect_b64 s[28:29], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s21, 10 +; GFX900-NEXT: s_cmp_eq_u32 s29, 1 ; GFX900-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s21, 11 -; GFX900-NEXT: s_cselect_b64 s[30:31], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s21, 8 +; GFX900-NEXT: s_cmp_eq_u32 s29, 14 +; GFX900-NEXT: v_mov_b32_e32 v9, s28 ; GFX900-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s21, 9 +; GFX900-NEXT: s_cmp_eq_u32 s29, 15 +; GFX900-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s29, 12 +; GFX900-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s29, 13 ; GFX900-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s29, 10 +; GFX900-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s29, 11 +; GFX900-NEXT: s_cselect_b64 s[24:25], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s29, 8 +; GFX900-NEXT: s_cselect_b64 s[26:27], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s29, 9 +; GFX900-NEXT: s_cselect_b64 s[28:29], -1, 0 ; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_cndmask_b32_e64 v10, v3, v9, s[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v10, v3, v9, vcc +; GFX900-NEXT: s_mov_b64 vcc, s[0:1] ; GFX900-NEXT: v_cndmask_b32_sdwa v3, v3, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[18:19] +; GFX900-NEXT: s_mov_b64 vcc, s[4:5] ; GFX900-NEXT: v_cndmask_b32_e64 v11, v2, v9, 
s[2:3] ; GFX900-NEXT: v_cndmask_b32_sdwa v2, v2, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[22:23] -; GFX900-NEXT: v_cndmask_b32_e64 v12, v1, v9, s[4:5] +; GFX900-NEXT: s_mov_b64 vcc, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v12, v1, v9, s[6:7] ; GFX900-NEXT: v_cndmask_b32_sdwa v1, v1, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[24:25] -; GFX900-NEXT: v_cndmask_b32_e64 v13, v0, v9, s[6:7] +; GFX900-NEXT: s_mov_b64 vcc, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v13, v0, v9, s[10:11] ; GFX900-NEXT: v_cndmask_b32_sdwa v0, v0, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[26:27] +; GFX900-NEXT: s_mov_b64 vcc, s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_cndmask_b32_e64 v14, v7, v9, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v14, v7, v9, s[14:15] ; GFX900-NEXT: v_cndmask_b32_sdwa v7, v7, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[28:29] -; GFX900-NEXT: v_cndmask_b32_e64 v15, v6, v9, s[10:11] +; GFX900-NEXT: s_mov_b64 vcc, s[20:21] +; GFX900-NEXT: v_cndmask_b32_e64 v15, v6, v9, s[18:19] ; GFX900-NEXT: v_cndmask_b32_sdwa v6, v6, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[30:31] -; GFX900-NEXT: v_perm_b32 v3, v3, v10, s33 -; GFX900-NEXT: v_cndmask_b32_e64 v10, v5, v9, s[12:13] +; GFX900-NEXT: s_mov_b64 vcc, s[24:25] +; GFX900-NEXT: v_perm_b32 v3, v3, v10, s30 +; GFX900-NEXT: v_cndmask_b32_e64 v10, v5, v9, s[22:23] ; GFX900-NEXT: v_cndmask_b32_sdwa v5, v5, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[20:21] -; GFX900-NEXT: v_perm_b32 v2, v2, v11, s33 -; GFX900-NEXT: v_cndmask_b32_e64 v11, v4, v9, s[14:15] +; GFX900-NEXT: s_mov_b64 vcc, s[28:29] +; GFX900-NEXT: v_perm_b32 v2, v2, v11, s30 +; GFX900-NEXT: v_cndmask_b32_e64 v11, v4, v9, s[26:27] ; GFX900-NEXT: v_cndmask_b32_sdwa v4, v4, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX900-NEXT: v_perm_b32 v7, v7, v14, s33 -; GFX900-NEXT: v_perm_b32 v6, v6, v15, s33 -; GFX900-NEXT: v_perm_b32 v5, v5, v10, s33 -; GFX900-NEXT: v_perm_b32 v4, v4, v11, s33 -; GFX900-NEXT: v_perm_b32 v1, v1, v12, s33 -; GFX900-NEXT: v_perm_b32 v0, v0, v13, s33 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_perm_b32 v7, v7, v14, s30 +; GFX900-NEXT: v_perm_b32 v6, v6, v15, s30 +; GFX900-NEXT: v_perm_b32 v5, v5, v10, s30 +; GFX900-NEXT: v_perm_b32 v4, v4, v11, s30 +; GFX900-NEXT: v_perm_b32 v1, v1, v12, s30 +; GFX900-NEXT: v_perm_b32 v0, v0, v13, s30 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[36:37] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[36:37] ; GFX900-NEXT: s_endpgm ; ; GFX942-LABEL: v_insertelement_v16bf16_dynamic: ; GFX942: ; %bb.0: -; GFX942-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 -; GFX942-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x10 +; GFX942-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[28:29], s[4:5], 0x10 ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX942-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; GFX942-NEXT: s_mov_b32 s33, 0x5040100 +; GFX942-NEXT: s_mov_b32 s30, 0x5040100 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: 
global_load_dwordx4 v[0:3], v8, s[18:19] -; GFX942-NEXT: global_load_dwordx4 v[4:7], v8, s[18:19] offset:16 -; GFX942-NEXT: s_cmp_eq_u32 s21, 6 +; GFX942-NEXT: global_load_dwordx4 v[0:3], v8, s[38:39] +; GFX942-NEXT: global_load_dwordx4 v[4:7], v8, s[38:39] offset:16 +; GFX942-NEXT: s_cmp_eq_u32 s29, 6 +; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s29, 7 ; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s21, 7 +; GFX942-NEXT: s_cmp_eq_u32 s29, 4 ; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s21, 4 -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] -; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s21, 5 -; GFX942-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s21, 2 +; GFX942-NEXT: s_cmp_eq_u32 s29, 5 ; GFX942-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s21, 3 -; GFX942-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s21, 0 +; GFX942-NEXT: s_cmp_eq_u32 s29, 2 ; GFX942-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s21, 1 -; GFX942-NEXT: s_cselect_b64 s[24:25], -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s21, 14 +; GFX942-NEXT: s_cmp_eq_u32 s29, 3 ; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s21, 15 -; GFX942-NEXT: v_mov_b32_e32 v9, s20 -; GFX942-NEXT: s_cselect_b64 s[26:27], -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s21, 12 +; GFX942-NEXT: s_cmp_eq_u32 s29, 0 ; GFX942-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s21, 13 -; GFX942-NEXT: s_cselect_b64 s[28:29], -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s21, 10 +; GFX942-NEXT: s_cmp_eq_u32 s29, 1 ; GFX942-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s21, 11 -; GFX942-NEXT: s_cselect_b64 s[30:31], -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s21, 8 +; GFX942-NEXT: s_cmp_eq_u32 s29, 14 +; GFX942-NEXT: v_mov_b32_e32 v9, s28 ; GFX942-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s21, 9 +; GFX942-NEXT: s_cmp_eq_u32 s29, 15 +; GFX942-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s29, 12 +; GFX942-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s29, 13 ; GFX942-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s29, 10 +; GFX942-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s29, 11 +; GFX942-NEXT: s_cselect_b64 s[24:25], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s29, 8 +; GFX942-NEXT: s_cselect_b64 s[26:27], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s29, 9 +; GFX942-NEXT: s_cselect_b64 s[28:29], -1, 0 ; GFX942-NEXT: s_waitcnt vmcnt(1) -; GFX942-NEXT: v_cndmask_b32_e64 v10, v3, v9, s[0:1] +; GFX942-NEXT: v_cndmask_b32_e32 v10, v3, v9, vcc +; GFX942-NEXT: s_mov_b64 vcc, s[0:1] ; GFX942-NEXT: v_cndmask_b32_sdwa v3, v3, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[18:19] +; GFX942-NEXT: s_mov_b64 vcc, s[4:5] ; GFX942-NEXT: v_cndmask_b32_e64 v11, v2, v9, s[2:3] -; GFX942-NEXT: v_cndmask_b32_e64 v12, v1, v9, s[4:5] ; GFX942-NEXT: v_cndmask_b32_sdwa v2, v2, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[22:23] -; GFX942-NEXT: v_cndmask_b32_e64 v13, v0, v9, s[6:7] -; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_cndmask_b32_e64 v14, v7, v9, s[8:9] +; GFX942-NEXT: s_mov_b64 vcc, s[8:9] +; GFX942-NEXT: v_cndmask_b32_e64 v12, v1, v9, s[6:7] ; GFX942-NEXT: v_cndmask_b32_sdwa v1, v1, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[24:25] -; GFX942-NEXT: v_cndmask_b32_e64 v15, v6, v9, s[10:11] -; GFX942-NEXT: v_cndmask_b32_e64 v16, v5, v9, s[12:13] +; GFX942-NEXT: s_mov_b64 vcc, s[12:13] +; GFX942-NEXT: v_cndmask_b32_e64 v13, v0, v9, s[10:11] ; GFX942-NEXT: v_cndmask_b32_sdwa v0, v0, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[26:27] -; GFX942-NEXT: v_cndmask_b32_e64 v17, v4, v9, s[14:15] -; GFX942-NEXT: v_perm_b32 v3, v3, v10, s33 +; GFX942-NEXT: s_mov_b64 vcc, s[16:17] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_cndmask_b32_e64 v14, v7, v9, s[14:15] ; GFX942-NEXT: v_cndmask_b32_sdwa v7, v7, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[28:29] -; GFX942-NEXT: v_perm_b32 v7, v7, v14, s33 -; GFX942-NEXT: v_perm_b32 v2, v2, v11, s33 +; GFX942-NEXT: s_mov_b64 vcc, s[20:21] +; GFX942-NEXT: v_cndmask_b32_e64 v15, v6, v9, s[18:19] ; GFX942-NEXT: v_cndmask_b32_sdwa v6, v6, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[30:31] -; GFX942-NEXT: v_perm_b32 v6, v6, v15, s33 -; GFX942-NEXT: v_perm_b32 v1, v1, v12, s33 +; GFX942-NEXT: s_mov_b64 vcc, s[24:25] +; GFX942-NEXT: v_cndmask_b32_e64 v16, v5, v9, s[22:23] ; GFX942-NEXT: v_cndmask_b32_sdwa v5, v5, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX942-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[20:21] -; GFX942-NEXT: v_perm_b32 v5, v5, v16, s33 -; GFX942-NEXT: v_perm_b32 v0, v0, v13, s33 +; GFX942-NEXT: s_mov_b64 vcc, s[28:29] +; GFX942-NEXT: v_cndmask_b32_e64 v17, v4, v9, s[26:27] ; GFX942-NEXT: v_cndmask_b32_sdwa v4, v4, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX942-NEXT: v_perm_b32 v4, v4, v17, s33 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX942-NEXT: v_perm_b32 v7, v7, v14, s30 +; GFX942-NEXT: v_perm_b32 v6, v6, v15, s30 +; GFX942-NEXT: v_perm_b32 v5, v5, v16, s30 +; GFX942-NEXT: v_perm_b32 v4, v4, v17, s30 +; GFX942-NEXT: v_perm_b32 v3, v3, v10, s30 +; GFX942-NEXT: v_perm_b32 v2, v2, v11, s30 +; GFX942-NEXT: v_perm_b32 v1, v1, v12, s30 +; GFX942-NEXT: v_perm_b32 v0, v0, v13, s30 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[36:37] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[36:37] ; GFX942-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 4f349a39a456e..e0dacb7a59a42 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -2694,47 +2694,47 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; GFX9-LABEL: v_insertelement_v8f16_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX9-NEXT: s_mov_b32 s16, 0x5040100 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 
v[0:3], v4, s[14:15] -; GFX9-NEXT: s_cmp_eq_u32 s11, 6 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[18:19] +; GFX9-NEXT: s_cmp_eq_u32 s13, 6 +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s13, 7 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s11, 7 -; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s11, 4 -; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] +; GFX9-NEXT: s_cmp_eq_u32 s13, 4 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s11, 5 -; GFX9-NEXT: v_mov_b32_e32 v5, s10 -; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s11, 2 +; GFX9-NEXT: s_cmp_eq_u32 s13, 5 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s11, 3 -; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s11, 0 +; GFX9-NEXT: s_cmp_eq_u32 s13, 2 +; GFX9-NEXT: v_mov_b32_e32 v5, s12 ; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s11, 1 +; GFX9-NEXT: s_cmp_eq_u32 s13, 3 +; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s13, 0 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s13, 1 +; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v6, v3, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc +; GFX9-NEXT: s_mov_b64 vcc, s[0:1] ; GFX9-NEXT: v_cndmask_b32_sdwa v3, v3, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[8:9] +; GFX9-NEXT: s_mov_b64 vcc, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v2, v5, s[2:3] ; GFX9-NEXT: v_cndmask_b32_sdwa v2, v2, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v5, s[4:5] +; GFX9-NEXT: s_mov_b64 vcc, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v5, s[6:7] ; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[10:11] -; GFX9-NEXT: v_perm_b32 v3, v3, v6, s16 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, v5, s[6:7] +; GFX9-NEXT: s_mov_b64 vcc, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v0, v5, s[10:11] ; GFX9-NEXT: v_cndmask_b32_sdwa v0, v0, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v2, v2, v7, s16 -; GFX9-NEXT: v_perm_b32 v1, v1, v8, s16 -; GFX9-NEXT: v_perm_b32 v0, v0, v6, s16 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] +; GFX9-NEXT: v_perm_b32 v3, v3, v6, s14 +; GFX9-NEXT: v_perm_b32 v2, v2, v7, s14 +; GFX9-NEXT: v_perm_b32 v1, v1, v8, s14 +; GFX9-NEXT: v_perm_b32 v0, v0, v9, s14 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v8f16_dynamic: @@ -2769,27 +2769,26 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s5, 2 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s5, 3 ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; VI-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s5, 0 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; 
VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 1 -; VI-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc -; VI-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 1 +; VI-NEXT: v_cndmask_b32_e32 v8, v0, v6, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e64 v7, v0, v6, s[0:1] ; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; @@ -3195,82 +3194,82 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; GFX9-LABEL: v_insertelement_v16f16_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[20:21], s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[28:29], s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; GFX9-NEXT: s_mov_b32 s33, 0x5040100 +; GFX9-NEXT: s_mov_b32 s30, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[18:19] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[18:19] offset:16 -; GFX9-NEXT: s_cmp_eq_u32 s21, 6 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[38:39] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[38:39] offset:16 +; GFX9-NEXT: s_cmp_eq_u32 s29, 6 +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s29, 7 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s21, 7 -; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s21, 4 -; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[2:3] +; GFX9-NEXT: s_cmp_eq_u32 s29, 4 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s21, 5 -; GFX9-NEXT: v_mov_b32_e32 v9, s20 -; GFX9-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s21, 2 +; GFX9-NEXT: s_cmp_eq_u32 s29, 5 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s21, 3 -; GFX9-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s21, 0 +; GFX9-NEXT: s_cmp_eq_u32 s29, 2 ; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s21, 1 -; GFX9-NEXT: s_cselect_b64 s[24:25], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s21, 14 +; GFX9-NEXT: s_cmp_eq_u32 s29, 3 ; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s21, 15 -; GFX9-NEXT: s_cselect_b64 s[26:27], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s21, 12 +; GFX9-NEXT: s_cmp_eq_u32 s29, 0 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s21, 13 -; GFX9-NEXT: s_cselect_b64 s[28:29], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s21, 10 +; GFX9-NEXT: s_cmp_eq_u32 s29, 1 ; GFX9-NEXT: s_cselect_b64 
s[12:13], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s21, 11 -; GFX9-NEXT: s_cselect_b64 s[30:31], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s21, 8 +; GFX9-NEXT: s_cmp_eq_u32 s29, 14 +; GFX9-NEXT: v_mov_b32_e32 v9, s28 ; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s21, 9 +; GFX9-NEXT: s_cmp_eq_u32 s29, 15 +; GFX9-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s29, 12 +; GFX9-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s29, 13 ; GFX9-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s29, 10 +; GFX9-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s29, 11 +; GFX9-NEXT: s_cselect_b64 s[24:25], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s29, 8 +; GFX9-NEXT: s_cselect_b64 s[26:27], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s29, 9 +; GFX9-NEXT: s_cselect_b64 s[28:29], -1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e64 v10, v3, v9, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v9, vcc +; GFX9-NEXT: s_mov_b64 vcc, s[0:1] ; GFX9-NEXT: v_cndmask_b32_sdwa v3, v3, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[18:19] +; GFX9-NEXT: s_mov_b64 vcc, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v11, v2, v9, s[2:3] ; GFX9-NEXT: v_cndmask_b32_sdwa v2, v2, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v1, v9, s[4:5] +; GFX9-NEXT: s_mov_b64 vcc, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v12, v1, v9, s[6:7] ; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[24:25] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v0, v9, s[6:7] +; GFX9-NEXT: s_mov_b64 vcc, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v13, v0, v9, s[10:11] ; GFX9-NEXT: v_cndmask_b32_sdwa v0, v0, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[26:27] +; GFX9-NEXT: s_mov_b64 vcc, s[16:17] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v14, v7, v9, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v14, v7, v9, s[14:15] ; GFX9-NEXT: v_cndmask_b32_sdwa v7, v7, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[28:29] -; GFX9-NEXT: v_cndmask_b32_e64 v15, v6, v9, s[10:11] +; GFX9-NEXT: s_mov_b64 vcc, s[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v15, v6, v9, s[18:19] ; GFX9-NEXT: v_cndmask_b32_sdwa v6, v6, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[30:31] -; GFX9-NEXT: v_perm_b32 v3, v3, v10, s33 -; GFX9-NEXT: v_cndmask_b32_e64 v10, v5, v9, s[12:13] +; GFX9-NEXT: s_mov_b64 vcc, s[24:25] +; GFX9-NEXT: v_perm_b32 v3, v3, v10, s30 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v5, v9, s[22:23] ; GFX9-NEXT: v_cndmask_b32_sdwa v5, v5, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[20:21] -; GFX9-NEXT: v_perm_b32 v2, v2, v11, s33 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v4, v9, s[14:15] +; GFX9-NEXT: s_mov_b64 vcc, s[28:29] +; GFX9-NEXT: v_perm_b32 v2, v2, v11, s30 +; GFX9-NEXT: v_cndmask_b32_e64 v11, v4, v9, s[26:27] ; GFX9-NEXT: v_cndmask_b32_sdwa v4, v4, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v7, v7, v14, s33 -; GFX9-NEXT: v_perm_b32 v6, v6, v15, s33 -; GFX9-NEXT: v_perm_b32 v5, v5, v10, s33 -; GFX9-NEXT: 
v_perm_b32 v4, v4, v11, s33 -; GFX9-NEXT: v_perm_b32 v1, v1, v12, s33 -; GFX9-NEXT: v_perm_b32 v0, v0, v13, s33 -; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX9-NEXT: v_perm_b32 v7, v7, v14, s30 +; GFX9-NEXT: v_perm_b32 v6, v6, v15, s30 +; GFX9-NEXT: v_perm_b32 v5, v5, v10, s30 +; GFX9-NEXT: v_perm_b32 v4, v4, v11, s30 +; GFX9-NEXT: v_perm_b32 v1, v1, v12, s30 +; GFX9-NEXT: v_perm_b32 v0, v0, v13, s30 +; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[36:37] offset:16 +; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[36:37] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v16f16_dynamic: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 6920924fa1547..be09e3451445e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -1596,21 +1596,20 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX900-LABEL: v_maximum_v4f16: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_cmp_o_f16_sdwa s[4:5], v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[4:5] ; GFX900-NEXT: v_pk_max_f16 v4, v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX900-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v3 -; GFX900-NEXT: v_cndmask_b32_e64 v1, v5, v4, s[4:5] -; GFX900-NEXT: v_cndmask_b32_sdwa v3, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX900-NEXT: v_pk_max_f16 v4, v0, v2 -; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_pk_max_f16 v3, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_perm_b32 v0, v0, v6, s4 -; GFX900-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_maximum_v4f16: @@ -1835,21 +1834,20 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX900-LABEL: v_maximum_v4f16__nsz: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_cmp_o_f16_sdwa s[4:5], v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[4:5] ; GFX900-NEXT: v_pk_max_f16 v4, v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX900-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v3 -; GFX900-NEXT: v_cndmask_b32_e64 v1, v5, v4, s[4:5] -; GFX900-NEXT: v_cndmask_b32_sdwa v3, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX900-NEXT: v_pk_max_f16 v4, v0, v2 -; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; 
GFX900-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_pk_max_f16 v3, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_perm_b32 v0, v0, v6, s4 -; GFX900-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_maximum_v4f16__nsz: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index b81c1f9a99aea..236fc7aa76c2b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -1281,21 +1281,20 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX900-LABEL: v_minimum_v4f16: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_cmp_o_f16_sdwa s[4:5], v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[4:5] ; GFX900-NEXT: v_pk_min_f16 v4, v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX900-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v3 -; GFX900-NEXT: v_cndmask_b32_e64 v1, v5, v4, s[4:5] -; GFX900-NEXT: v_cndmask_b32_sdwa v3, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX900-NEXT: v_pk_min_f16 v4, v0, v2 -; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_pk_min_f16 v3, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_perm_b32 v0, v0, v6, s4 -; GFX900-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_minimum_v4f16: @@ -1461,21 +1460,20 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX900-LABEL: v_minimum_v4f16__nsz: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_cmp_o_f16_sdwa s[4:5], v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cmp_eq_u64_e64 vcc, 1, s[4:5] ; GFX900-NEXT: v_pk_min_f16 v4, v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX900-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v3 -; GFX900-NEXT: v_cndmask_b32_e64 v1, v5, v4, s[4:5] -; GFX900-NEXT: v_cndmask_b32_sdwa v3, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX900-NEXT: v_pk_min_f16 v4, v0, v2 -; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; 
GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_pk_min_f16 v3, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_perm_b32 v0, v0, v6, s4 -; GFX900-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_minimum_v4f16__nsz: From fc50f87698c0cad076e9156bda5e24b21154eeed Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Wed, 30 Apr 2025 08:49:03 -0400 Subject: [PATCH 09/26] Handle wave32 --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 10 +- llvm/test/CodeGen/AMDGPU/bf16.ll | 92 +++++----- .../CodeGen/AMDGPU/dagcombine-fmul-sel.ll | 48 ++--- llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 165 +++++++++--------- llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 165 +++++++++--------- .../AMDGPU/sdwa-peephole-cndmask-gfx10.mir | 51 +++--- .../AMDGPU/sdwa-peephole-cndmask-gfx9.mir | 39 ++--- 7 files changed, 263 insertions(+), 307 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 228b16e0ad70a..dca303eec0572 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1074,11 +1074,6 @@ void SIPeepholeSDWA::convertToImplicitVcc(MachineInstr &MI, const GCNSubtarget &ST) const { assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64); - MCRegister Vcc = TRI->getVCC(); - // FIXME Conversion introduces implicit vcc_hi use - if (Vcc == AMDGPU::VCC_LO) - return; - LLVM_DEBUG(dbgs() << "Attempting VOP2 conversion: " << MI); if (!TII->canShrink(MI, *MRI)) { LLVM_DEBUG(dbgs() << "Cannot shrink instruction\n"); @@ -1090,6 +1085,7 @@ void SIPeepholeSDWA::convertToImplicitVcc(MachineInstr &MI, // Make sure VCC or its subregs are dead before MI. 
MachineBasicBlock &MBB = *MI.getParent(); + MCRegister Vcc = TRI->getVCC(); MachineBasicBlock::LivenessQueryResult Liveness = MBB.computeRegisterLiveness(TRI, Vcc, MI); if (Liveness != MachineBasicBlock::LQR_Dead) { @@ -1328,7 +1324,9 @@ MachineInstr *SIPeepholeSDWA::createSDWAVersion(MachineInstr &MI) { SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1); } - return SDWAInst.getInstr(); + MachineInstr *Ret = SDWAInst.getInstr(); + TII->fixImplicitOperands(*Ret); + return Ret; } bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index cb089b15e0f4c..c656a20c10945 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -38500,11 +38500,9 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -38597,14 +38595,12 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo ; GFX10-LABEL: v_vselect_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v0 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v2, s4 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -40803,23 +40799,17 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v3 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc_lo -; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v7, v5, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v0 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v6, v4, vcc_lo dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_sdwa v3, v7, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v4, s4 +; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11TRUE16-LABEL: v_vselect_v4bf16: @@ -41067,42 +41057,40 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GFX10-LABEL: v_vselect_v8bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v10 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v14 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v6 +; GFX10-NEXT: v_and_b32_e32 v6, 1, v7 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v11 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v15, v11, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v10 ; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v17, v16, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v10, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v16, v11, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v13, v9, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v12, v8, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v8, v12, v8, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v14, v10, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v9, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 -; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v10, v9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc_lo ; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 -; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v6, v3, 0x5040100 ; GFX10-NEXT: s_setpc_b64 
s[30:31] ; ; GFX11TRUE16-LABEL: v_vselect_v8bf16: diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll index 9dd5fd91d186a..4e10f461beebf 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll @@ -2794,10 +2794,10 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, 0x4000 +; GFX10-NEXT: v_mov_b32_e32 v4, 0x3f80 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -2928,10 +2928,10 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, 0x3f00 +; GFX10-NEXT: v_mov_b32_e32 v4, 0x3f80 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -3080,12 +3080,12 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, 0x4000 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x3f80 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, v2, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v1, v5, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0x3f80, v5, s4 ; GFX10-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 @@ -3289,12 +3289,12 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, 0x3f00 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x3f80 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, v2, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v1, v5, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0x3f80, v5, s4 ; GFX10-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 @@ -3481,10 +3481,10 @@ define 
bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, 0x4000 +; GFX10-NEXT: v_mov_b32_e32 v4, 0x4100 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -3617,10 +3617,10 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, 0xffffc100 +; GFX10-NEXT: v_mov_b32_e32 v4, 0x4040 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -3752,10 +3752,10 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, 0x4100 +; GFX10-NEXT: v_mov_b32_e32 v4, 0xffffc080 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -4016,10 +4016,10 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, 0xffffc180 +; GFX10-NEXT: v_mov_b32_e32 v4, 0xffffc200 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -4152,10 +4152,10 @@ define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, 0xffffe000 +; GFX10-NEXT: v_mov_b32_e32 v4, 0xffffdb80 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -4288,10 +4288,10 @@ define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %b ; GFX10: ; 
%bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, 0x3480 +; GFX10-NEXT: v_mov_b32_e32 v4, 0x4c00 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index be09e3451445e..426e05236e62f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -696,13 +696,13 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX10-LABEL: v_maximum_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v0, v1 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX10-NEXT: v_pk_max_f16 v3, v0, v1 ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v3, s4 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_maximum_v2f16: @@ -862,13 +862,13 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX10-LABEL: v_maximum_v2f16__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v0, v1 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX10-NEXT: v_pk_max_f16 v3, v0, v1 ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v3, s4 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_maximum_v2f16__nsz: @@ -1055,16 +1055,16 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX10-LABEL: s_maximum_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s16, s17 -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s16, s17 ; GFX10-NEXT: s_lshr_b32 s4, s17, 16 ; GFX10-NEXT: s_lshr_b32 s5, s16, 16 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v0, s16, s17 ; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s5, s4 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, s16, s17 +; 
GFX10-NEXT: v_mov_b32_e32 v1, 0x7e00 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0x7e00, v0, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v0, v1, v0, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v0 ; GFX10-NEXT: ;;#ASMEND @@ -1199,16 +1199,16 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX10-LABEL: v_maximum_v3f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v4, v0, v2 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX10-NEXT: v_pk_max_f16 v5, v0, v2 ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_pk_max_f16 v2, v1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v2 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v5, s4 +; GFX10-NEXT: v_pk_max_f16 v4, v1, v3 ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v2, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_maximum_v3f16: @@ -1402,16 +1402,16 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX10-LABEL: v_maximum_v3f16__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v4, v0, v2 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX10-NEXT: v_pk_max_f16 v5, v0, v2 ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_pk_max_f16 v2, v1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v2 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v5, s4 +; GFX10-NEXT: v_pk_max_f16 v4, v1, v3 ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v2, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_maximum_v3f16__nsz: @@ -1622,20 +1622,19 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX10-LABEL: v_maximum_v4f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_pk_max_f16 v5, v0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v4, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, 0x7e00, v5, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo ; GFX10-NEXT: 
v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo -; GFX10-NEXT: v_perm_b32 v1, v1, v6, 0x5040100 +; GFX10-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX10-NEXT: v_pk_max_f16 v5, v1, v3 +; GFX10-NEXT: v_pk_max_f16 v6, v0, v2 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v2 +; GFX10-NEXT: v_cndmask_b32_sdwa v7, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v6, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v4, v6, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo +; GFX10-NEXT: v_perm_b32 v1, v7, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_maximum_v4f16: @@ -1860,20 +1859,19 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX10-LABEL: v_maximum_v4f16__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_pk_max_f16 v5, v0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v4, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, 0x7e00, v5, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo -; GFX10-NEXT: v_perm_b32 v1, v1, v6, 0x5040100 +; GFX10-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX10-NEXT: v_pk_max_f16 v5, v1, v3 +; GFX10-NEXT: v_pk_max_f16 v6, v0, v2 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v2 +; GFX10-NEXT: v_cndmask_b32_sdwa v7, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v6, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v4, v6, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo +; GFX10-NEXT: v_perm_b32 v1, v7, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_maximum_v4f16__nsz: @@ -2167,34 +2165,33 @@ define <8 x half> @v_maximum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX10-LABEL: v_maximum_v8f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_pk_max_f16 v8, v3, v7 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 -; GFX10-NEXT: v_pk_max_f16 v9, v2, v6 +; GFX10-NEXT: v_mov_b32_e32 v9, 0x7e00 +; GFX10-NEXT: v_cmp_o_f16_sdwa s4, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_pk_max_f16 v11, v2, v6 +; GFX10-NEXT: v_cmp_o_f16_sdwa s5, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_pk_max_f16 v12, v1, v5 -; GFX10-NEXT: v_pk_max_f16 v13, v0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v10, 0x7e00, v8, vcc_lo -; 
GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v9, 0x7e00, v9, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v11, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v12, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v13, 0x7e00, v13, vcc_lo +; GFX10-NEXT: v_cndmask_b32_sdwa v10, v9, v8, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_pk_max_f16 v15, v0, v4 +; GFX10-NEXT: v_cndmask_b32_sdwa v13, v9, v11, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: s_mov_b32 vcc_lo, s5 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v2, v6 +; GFX10-NEXT: v_cndmask_b32_sdwa v14, v9, v12, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_perm_b32 v0, v0, v13, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0x7e00, v11, s4 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v4 +; GFX10-NEXT: v_cndmask_b32_sdwa v4, v9, v15, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 +; GFX10-NEXT: v_perm_b32 v2, v13, v2, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v15, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v12, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_perm_b32 v1, v1, v6, 0x5040100 +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 +; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v14, v1, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v8, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v3, v10, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v10, v3, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_maximum_v8f16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index 236fc7aa76c2b..a8e7ccb5d326f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -586,13 +586,13 @@ define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX10-LABEL: v_minimum_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_min_f16 v2, v0, v1 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX10-NEXT: v_pk_min_f16 v3, v0, v1 ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v3, s4 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 
s[30:31] ; ; GFX11-TRUE16-LABEL: v_minimum_v2f16: @@ -717,13 +717,13 @@ define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX10-LABEL: v_minimum_v2f16__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_min_f16 v2, v0, v1 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX10-NEXT: v_pk_min_f16 v3, v0, v1 ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v3, s4 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_minimum_v2f16__nsz: @@ -868,16 +868,16 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX10-LABEL: s_minimum_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_min_f16 v0, s16, s17 -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s16, s17 ; GFX10-NEXT: s_lshr_b32 s4, s17, 16 ; GFX10-NEXT: s_lshr_b32 s5, s16, 16 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo +; GFX10-NEXT: v_pk_min_f16 v0, s16, s17 ; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s5, s4 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, s16, s17 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x7e00 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0x7e00, v0, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v0, v1, v0, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v0 ; GFX10-NEXT: ;;#ASMEND @@ -985,16 +985,16 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX10-LABEL: v_minimum_v3f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_min_f16 v4, v0, v2 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX10-NEXT: v_pk_min_f16 v5, v0, v2 ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_pk_min_f16 v2, v1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v2 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v5, s4 +; GFX10-NEXT: v_pk_min_f16 v4, v1, v3 ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v2, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_minimum_v3f16: @@ -1141,16 +1141,16 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX10-LABEL: v_minimum_v3f16__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_min_f16 v4, v0, v2 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX10-NEXT: v_pk_min_f16 v5, v0, v2 ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_pk_min_f16 v2, v1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v2 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v5, s4 +; GFX10-NEXT: v_pk_min_f16 v4, v1, v3 ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v2, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_minimum_v3f16__nsz: @@ -1307,20 +1307,19 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX10-LABEL: v_minimum_v4f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_pk_min_f16 v5, v0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v4, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, 0x7e00, v5, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo -; GFX10-NEXT: v_perm_b32 v1, v1, v6, 0x5040100 +; GFX10-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX10-NEXT: v_pk_min_f16 v5, v1, v3 +; GFX10-NEXT: v_pk_min_f16 v6, v0, v2 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v2 +; GFX10-NEXT: v_cndmask_b32_sdwa v7, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v6, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v4, v6, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo +; GFX10-NEXT: v_perm_b32 v1, v7, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_minimum_v4f16: @@ -1486,20 +1485,19 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX10-LABEL: v_minimum_v4f16__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_pk_min_f16 v5, v0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v4, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, 0x7e00, v5, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo ; GFX10-NEXT: v_cmp_o_f16_sdwa 
vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo -; GFX10-NEXT: v_perm_b32 v1, v1, v6, 0x5040100 +; GFX10-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX10-NEXT: v_pk_min_f16 v5, v1, v3 +; GFX10-NEXT: v_pk_min_f16 v6, v0, v2 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v2 +; GFX10-NEXT: v_cndmask_b32_sdwa v7, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v6, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v4, v6, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo +; GFX10-NEXT: v_perm_b32 v1, v7, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_minimum_v4f16__nsz: @@ -1706,34 +1704,33 @@ define <8 x half> @v_minimum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX10-LABEL: v_minimum_v8f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_pk_min_f16 v8, v3, v7 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 -; GFX10-NEXT: v_pk_min_f16 v9, v2, v6 +; GFX10-NEXT: v_mov_b32_e32 v9, 0x7e00 +; GFX10-NEXT: v_cmp_o_f16_sdwa s4, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_pk_min_f16 v11, v2, v6 +; GFX10-NEXT: v_cmp_o_f16_sdwa s5, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_pk_min_f16 v12, v1, v5 -; GFX10-NEXT: v_pk_min_f16 v13, v0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v10, 0x7e00, v8, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v9, 0x7e00, v9, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v11, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v12, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v13, 0x7e00, v13, vcc_lo +; GFX10-NEXT: v_cndmask_b32_sdwa v10, v9, v8, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_pk_min_f16 v15, v0, v4 +; GFX10-NEXT: v_cndmask_b32_sdwa v13, v9, v11, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: s_mov_b32 vcc_lo, s5 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v2, v6 +; GFX10-NEXT: v_cndmask_b32_sdwa v14, v9, v12, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_perm_b32 v0, v0, v13, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0x7e00, v11, s4 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v4 +; GFX10-NEXT: v_cndmask_b32_sdwa v4, v9, v15, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 +; GFX10-NEXT: v_perm_b32 v2, v13, v2, 0x5040100 +; 
GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v15, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v12, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_perm_b32 v1, v1, v6, 0x5040100 +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 +; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v14, v1, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v8, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v3, v10, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v10, v3, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_minimum_v8f16: diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-gfx10.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-gfx10.mir index e66d7d80a803c..cf77cca22eb60 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-gfx10.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-gfx10.mir @@ -1,43 +1,32 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc %s -mtriple=amdgcn -mcpu=gfx1030 -run-pass=si-peephole-sdwa -o - | FileCheck %s -# For conversion of_CNDMASK_B32_e64 to SDWA, the destination of V_CMP_O_F16_e64 must be -# changed to vcc_lo first. This would introduce a vcc_hi use that requires special -# handling in si-peephole-sdwa. - +... --- -name: v_minimum_v2f16__nsz +name: v_maximum_v2f16 tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0, $vgpr1 + liveins: $vgpr0 - ; CHECK-LABEL: name: v_minimum_v2f16__nsz - ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: v_maximum_v2f16 + ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[V_PK_MIN_F16_:%[0-9]+]]:vgpr_32 = V_PK_MIN_F16 8, [[COPY]], 8, undef [[DEF]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 32256, 0, [[V_PK_MIN_F16_]], killed undef [[DEF1]], implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY]], implicit $exec - ; CHECK-NEXT: [[V_CMP_O_F16_sdwa:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_O_F16_sdwa 0, [[COPY]], 0, undef [[V_CNDMASK_B32_e64_]], 0, 5, 6, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[V_PK_MIN_F16_]], implicit $exec - ; CHECK-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 32256, 0, undef [[V_LSHRREV_B32_e64_1]], killed undef [[V_CMP_O_F16_sdwa]], implicit $exec - ; CHECK-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 killed [[V_CNDMASK_B32_e64_1]], [[V_CNDMASK_B32_e64_]], 84148480, implicit $exec - ; CHECK-NEXT: $vgpr0 = COPY [[V_PERM_B32_e64_]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, undef [[DEF1]], 8, undef [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[V_PK_MAX_F16_]], implicit $exec + ; CHECK-NEXT: $vcc_lo = COPY killed undef [[DEF]] + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 32256, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef [[V_PK_MAX_F16_]], 0, 6, 0, 6, 5, implicit $vcc_lo, implicit 
$exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] ; CHECK-NEXT: SI_RETURN implicit $vgpr0 - %30:vgpr_32 = IMPLICIT_DEF - %31:sreg_32_xm0_xexec = IMPLICIT_DEF - %8:vgpr_32 = COPY $vgpr0 - %13:vgpr_32 = V_PK_MIN_F16 8, %8, 8, undef %30, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - %16:vgpr_32 = V_CNDMASK_B32_e64 0, 32256, 0, %13, killed undef %31, implicit $exec - %20:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec - %22:sreg_32_xm0_xexec = V_CMP_O_F16_e64 0, undef %20, 0, undef %16, 0, implicit $mode, implicit $exec - %23:vgpr_32 = V_LSHRREV_B32_e64 16, undef %13, implicit $exec - %25:vgpr_32 = V_CNDMASK_B32_e64 0, 32256, 0, undef %23, killed undef %22, implicit $exec - %27:vgpr_32 = V_PERM_B32_e64 killed %25, killed %16, 84148480, implicit $exec - $vgpr0 = COPY %27 + %1:sreg_32_xm0_xexec = IMPLICIT_DEF + %2:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = V_PK_MAX_F16 8, undef %2, 8, undef %2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %4:vgpr_32 = V_LSHRREV_B32_e64 16, undef %3, implicit $exec + %5:vgpr_32 = V_CNDMASK_B32_e64 0, 32256, 0, undef %4, killed undef %1, implicit $exec + $vgpr0 = COPY %5 SI_RETURN implicit $vgpr0 - ... + diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-gfx9.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-gfx9.mir index 6529f190294df..5a04ae6ea2b6b 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-gfx9.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-gfx9.mir @@ -2,47 +2,34 @@ # RUN: llc %s -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - | FileCheck -check-prefix=gfx9 %s # Test conversion of V_CNDMASK_B32 to VOPC for enabling further conversion to SDWA. -# For this, the definition of the src2 carry-in operand must changed to write +# For this, the definition of the src2 carry-in operand must be changed to write # to VCC. 
--- name: v_vselect_v2bf16 tracksRegLiveness: true body: | + bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2 + liveins: $vgpr0, $vgpr1 ; gfx9-LABEL: name: v_vselect_v2bf16 - ; gfx9: liveins: $vgpr0, $vgpr1, $vgpr2 + ; gfx9: liveins: $vgpr0, $vgpr1 ; gfx9-NEXT: {{ $}} - ; gfx9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; gfx9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; gfx9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; gfx9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; gfx9-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY1]], implicit $exec - ; gfx9-NEXT: $vcc = V_CMP_EQ_U32_e64 killed [[V_AND_B32_e64_]], 1, implicit $exec - ; gfx9-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY2]], implicit $exec - ; gfx9-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 killed [[V_AND_B32_e64_1]], 1, implicit $exec - ; gfx9-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY]], killed [[V_CMP_EQ_U32_e64_]], implicit $exec + ; gfx9-NEXT: $vcc = V_CMP_EQ_U32_e64 killed [[COPY1]], 1, implicit $exec ; gfx9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY]], implicit $exec ; gfx9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; gfx9-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[COPY]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec - ; gfx9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 84148480 - ; gfx9-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 killed [[V_CNDMASK_B32_sdwa]], killed [[V_CNDMASK_B32_e64_]], killed [[S_MOV_B32_]], implicit $exec - ; gfx9-NEXT: $vgpr0 = COPY [[V_PERM_B32_e64_]] + ; gfx9-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] ; gfx9-NEXT: SI_RETURN implicit $vgpr0 - %10:vgpr_32 = COPY $vgpr2 - %9:vgpr_32 = COPY $vgpr1 - %8:vgpr_32 = COPY $vgpr0 - %11:vgpr_32 = V_AND_B32_e64 1, %9, implicit $exec - %12:sreg_64_xexec = V_CMP_EQ_U32_e64 killed %11, 1, implicit $exec - %13:vgpr_32 = V_AND_B32_e64 1, %8, implicit $exec - %14:sreg_64_xexec = V_CMP_EQ_U32_e64 killed %13, 1, implicit $exec - %17:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %10, killed %14, implicit $exec - %20:vgpr_32 = V_LSHRREV_B32_e64 16, %10, implicit $exec - %22:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %20, killed %12, implicit $exec - %24:sreg_32 = S_MOV_B32 84148480 - %25:vgpr_32 = V_PERM_B32_e64 killed %22, killed %17, killed %24, implicit $exec - $vgpr0 = COPY %25 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = COPY $vgpr1 + %3:sreg_64_xexec = V_CMP_EQ_U32_e64 killed %2, 1, implicit $exec + %4:vgpr_32 = V_LSHRREV_B32_e64 16, %1, implicit $exec + %5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %4, killed %3, implicit $exec + $vgpr0 = COPY %5 SI_RETURN implicit $vgpr0 ... 
From f05ec81463a672c7e7447d2dddcd41cc0e37c39c Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Wed, 30 Apr 2025 08:50:46 -0400 Subject: [PATCH 10/26] Rename sdwa-peephole-cndmask-gfx{9,10} tests --- ...eephole-cndmask-gfx10.mir => sdwa-peephole-cndmask-wave32.mir} | 0 ...peephole-cndmask-gfx9.mir => sdwa-peephole-cndmask-wave64.mir} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename llvm/test/CodeGen/AMDGPU/{sdwa-peephole-cndmask-gfx10.mir => sdwa-peephole-cndmask-wave32.mir} (100%) rename llvm/test/CodeGen/AMDGPU/{sdwa-peephole-cndmask-gfx9.mir => sdwa-peephole-cndmask-wave64.mir} (100%) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-gfx10.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir similarity index 100% rename from llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-gfx10.mir rename to llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-gfx9.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave64.mir similarity index 100% rename from llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-gfx9.mir rename to llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave64.mir From 3b2dc23b82450231b3e9315c3c279c6fa4e47030 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Wed, 30 Apr 2025 09:01:26 -0400 Subject: [PATCH 11/26] Unify test names --- ...{sdwa-peephole-vcndmask.mir => sdwa-peephole-cndmask-vop2.mir} | 0 ...e-cndmask-wave32.mir => sdwa-peephole-cndmask-vop3-wave32.mir} | 0 ...e-cndmask-wave64.mir => sdwa-peephole-cndmask-vop3-wave64.mir} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename llvm/test/CodeGen/AMDGPU/{sdwa-peephole-vcndmask.mir => sdwa-peephole-cndmask-vop2.mir} (100%) rename llvm/test/CodeGen/AMDGPU/{sdwa-peephole-cndmask-wave32.mir => sdwa-peephole-cndmask-vop3-wave32.mir} (100%) rename llvm/test/CodeGen/AMDGPU/{sdwa-peephole-cndmask-wave64.mir => sdwa-peephole-cndmask-vop3-wave64.mir} (100%) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-vcndmask.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop2.mir similarity index 100% rename from llvm/test/CodeGen/AMDGPU/sdwa-peephole-vcndmask.mir rename to llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop2.mir diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir similarity index 100% rename from llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir rename to llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave64.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir similarity index 100% rename from llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave64.mir rename to llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir From f80752682384d4bad7498101f7002b815aa38c46 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Fri, 2 May 2025 07:49:10 +0200 Subject: [PATCH 12/26] Update llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp Co-authored-by: Matt Arsenault --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index dca303eec0572..f5babc560045d 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1103,10 +1103,10 @@ void SIPeepholeSDWA::convertToImplicitVcc(MachineInstr &MI, if (CarryDef->isCompare() && TII->isVOP3(*CarryDef) && 
MRI->hasOneUse(CarryIn.getReg())) CarryDef->substituteRegister(CarryIn.getReg(), Vcc, 0, *TRI); - else - BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY)) - .addReg(Vcc, RegState::Define) + else { + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), Vcc) .add(CarryIn); + } auto Converted = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::getVOPe32(MI.getOpcode()))) From 9bea2ed15b2a7b5fc5b754f1b454e4e31e87d5f4 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Fri, 2 May 2025 01:59:41 -0400 Subject: [PATCH 13/26] clang-format changes --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index f5babc560045d..746e1c21b0bed 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1106,7 +1106,7 @@ void SIPeepholeSDWA::convertToImplicitVcc(MachineInstr &MI, else { BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), Vcc) .add(CarryIn); - } + } auto Converted = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::getVOPe32(MI.getOpcode()))) From af365ee5ae04c72083606ffd88636c4eb0da3901 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Fri, 2 May 2025 02:07:21 -0400 Subject: [PATCH 14/26] Rename convertToImplicitVcc and move CarryDef up --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 746e1c21b0bed..91df8ee4b24da 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -62,7 +62,7 @@ class SIPeepholeSDWA { std::unique_ptr matchSDWAOperand(MachineInstr &MI); void pseudoOpConvertToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const; - void convertToImplicitVcc(MachineInstr &MI, const GCNSubtarget &ST) const; + void convertVcndmaskToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const; MachineInstr *createSDWAVersion(MachineInstr &MI); bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const; @@ -1070,7 +1070,7 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI, /// copies from the carry-in to VCC. The conversion will only be /// applied if \p MI can be shrunk to VOP2 and if VCC can be proven to /// be dead before \p MI. -void SIPeepholeSDWA::convertToImplicitVcc(MachineInstr &MI, +void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const { assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64); @@ -1082,10 +1082,14 @@ void SIPeepholeSDWA::convertToImplicitVcc(MachineInstr &MI, const MachineOperand &CarryIn = *TII->getNamedOperand(MI, AMDGPU::OpName::src2); + Register CarryReg = CarryIn.getReg(); + MachineInstr *CarryDef = MRI->getVRegDef(CarryReg); + if (!CarryDef) + return; // Make sure VCC or its subregs are dead before MI. 
- MachineBasicBlock &MBB = *MI.getParent(); MCRegister Vcc = TRI->getVCC(); + MachineBasicBlock &MBB = *MI.getParent(); MachineBasicBlock::LivenessQueryResult Liveness = MBB.computeRegisterLiveness(TRI, Vcc, MI); if (Liveness != MachineBasicBlock::LQR_Dead) { @@ -1093,11 +1097,6 @@ void SIPeepholeSDWA::convertToImplicitVcc(MachineInstr &MI, return; } - Register CarryReg = CarryIn.getReg(); - MachineInstr *CarryDef = MRI->getVRegDef(CarryReg); - if (!CarryDef) - return; - // Change destination of compare instruction to VCC // or copy to VCC if carry-in is not a compare inst. if (CarryDef->isCompare() && TII->isVOP3(*CarryDef) && @@ -1453,7 +1452,7 @@ bool SIPeepholeSDWA::run(MachineFunction &MF) { pseudoOpConvertToVOP2(*PotentialMI, ST); break; case AMDGPU::V_CNDMASK_B32_e64: - convertToImplicitVcc(*PotentialMI, ST); + convertVcndmaskToVOP2(*PotentialMI, ST); break; }; } From 3c8bc54b163faf6fe91e98a07fa226cdcffb4bc4 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Fri, 2 May 2025 08:29:16 -0400 Subject: [PATCH 15/26] Extend tests Further changes: - Add debug output for missing carry-in def. --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 6 +- .../AMDGPU/sdwa-peephole-cndmask-vop2.mir | 44 ++--- .../sdwa-peephole-cndmask-vop3-wave32.mir | 160 +++++++++++++++-- .../sdwa-peephole-cndmask-vop3-wave64.mir | 169 ++++++++++++++++-- 4 files changed, 329 insertions(+), 50 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 91df8ee4b24da..3556e79b3ec1c 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1084,8 +1084,10 @@ void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI, *TII->getNamedOperand(MI, AMDGPU::OpName::src2); Register CarryReg = CarryIn.getReg(); MachineInstr *CarryDef = MRI->getVRegDef(CarryReg); - if (!CarryDef) + if (!CarryDef) { + LLVM_DEBUG(dbgs() << "Missing carry-in operand definition\n"); return; + } // Make sure VCC or its subregs are dead before MI. 
MCRegister Vcc = TRI->getVCC(); @@ -1093,7 +1095,7 @@ void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI, MachineBasicBlock::LivenessQueryResult Liveness = MBB.computeRegisterLiveness(TRI, Vcc, MI); if (Liveness != MachineBasicBlock::LQR_Dead) { - LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction.\n"); + LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction\n"); return; } diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop2.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop2.mir index 46af1e214aa45..b746320d7ecea 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop2.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop2.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=si-peephole-sdwa -mcpu=gfx803 -o - %s | FileCheck -check-prefix=gfx8 %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=si-peephole-sdwa -mcpu=gfx1100 -o - %s | FileCheck -check-prefix=gfx11 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=si-peephole-sdwa -mcpu=gfx803 -o - %s | FileCheck -check-prefix=GFX8 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=si-peephole-sdwa -mcpu=gfx1100 -o - %s | FileCheck -check-prefix=GFX11 %s --- name: v_cndmask_b32_test @@ -9,27 +9,27 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vcc - ; gfx8-LABEL: name: v_cndmask_b32_test - ; gfx8: liveins: $vgpr0, $vgpr1, $vcc - ; gfx8-NEXT: {{ $}} - ; gfx8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; gfx8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; gfx8-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY]], implicit $exec - ; gfx8-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY1]], implicit $exec - ; gfx8-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[COPY]], 0, [[COPY1]], 0, 6, 0, 5, 5, implicit $vcc, implicit $exec - ; gfx8-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] - ; gfx8-NEXT: SI_RETURN implicit $vgpr0 + ; GFX8-LABEL: name: v_cndmask_b32_test + ; GFX8: liveins: $vgpr0, $vgpr1, $vcc + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[COPY]], 0, [[COPY1]], 0, 6, 0, 5, 5, implicit $vcc, implicit $exec + ; GFX8-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] + ; GFX8-NEXT: SI_RETURN implicit $vgpr0 ; - ; gfx11-LABEL: name: v_cndmask_b32_test - ; gfx11: liveins: $vgpr0, $vgpr1, $vcc - ; gfx11-NEXT: {{ $}} - ; gfx11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; gfx11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; gfx11-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY]], implicit $exec - ; gfx11-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY1]], implicit $exec - ; gfx11-NEXT: [[V_CNDMASK_B32_e32_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 killed [[V_LSHRREV_B32_e64_]], killed [[V_LSHRREV_B32_e64_1]], implicit $exec, implicit $vcc_lo - ; gfx11-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e32_]] - ; gfx11-NEXT: SI_RETURN implicit $vgpr0 + ; GFX11-LABEL: name: v_cndmask_b32_test + ; GFX11: liveins: $vgpr0, $vgpr1, $vcc + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: 
[[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY]], implicit $exec + ; GFX11-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY1]], implicit $exec + ; GFX11-NEXT: [[V_CNDMASK_B32_e32_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 killed [[V_LSHRREV_B32_e64_]], killed [[V_LSHRREV_B32_e64_1]], implicit $exec, implicit $vcc_lo + ; GFX11-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e32_]] + ; GFX11-NEXT: SI_RETURN implicit $vgpr0 %1:vgpr_32 = COPY $vgpr1 %2:vgpr_32 = COPY $vgpr0 %3:vgpr_32 = V_LSHRREV_B32_e64 16, %1, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir index cf77cca22eb60..92b4c07bc45d5 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir @@ -1,32 +1,172 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc %s -mtriple=amdgcn -mcpu=gfx1030 -run-pass=si-peephole-sdwa -o - | FileCheck %s +--- +name: change-compare-to-vopc # carry-in def is a comparison that can be changed to VOPC +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: change-compare-to-vopc + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: $vcc_lo = V_CMP_EQ_U32_e64 undef [[DEF]], 1, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef [[DEF1]], 0, 6, 0, 6, 5, implicit $vcc_lo, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:sreg_32_xm0_xexec = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 undef %0, 1, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed %2, implicit $exec + $vgpr0 = COPY %4 + SI_RETURN implicit $vgpr0 +... + ... --- -name: v_maximum_v2f16 +name: carry-copy-non-compare # copy of carry-in necessary because def. instr. 
cannot be changed to write to VCC tracksRegLiveness: true body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: v_maximum_v2f16 + ; CHECK-LABEL: name: carry-copy-non-compare ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, undef [[DEF1]], 8, undef [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[V_PK_MAX_F16_]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_PK_MAX_F16_]], implicit $exec ; CHECK-NEXT: $vcc_lo = COPY killed undef [[DEF]] - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 32256, implicit $exec - ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef [[V_PK_MAX_F16_]], 0, 6, 0, 6, 5, implicit $vcc_lo, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[V_PK_MAX_F16_]], 0, 6, 0, 6, 5, implicit $vcc_lo, implicit $exec ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] ; CHECK-NEXT: SI_RETURN implicit $vgpr0 - %1:sreg_32_xm0_xexec = IMPLICIT_DEF + %0:sreg_32_xm0_xexec = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = V_PK_MAX_F16 8, undef %1, 8, undef %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, undef %3, killed undef %0, implicit $exec + $vgpr0 = COPY %4 + SI_RETURN implicit $vgpr0 +... + +... +--- +name: carry-copy-multiuse # copy of carry-in necessary because of second use +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: carry-copy-multiuse + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, undef [[DEF1]], 8, undef [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_PK_MAX_F16_]], implicit $exec + ; CHECK-NEXT: $vcc_lo = COPY killed undef [[DEF]] + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[V_PK_MAX_F16_]], 0, 6, 0, 6, 5, implicit $vcc_lo, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] + ; CHECK-NEXT: $vgpr1 = COPY [[DEF]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:sreg_32_xm0_xexec = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = V_PK_MAX_F16 8, undef %1, 8, undef %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed undef %0, implicit $exec + $vgpr0 = COPY %4 + $vgpr1 = COPY %0 + SI_RETURN implicit $vgpr0 +... + +... 
+--- +name: live-vcc # cannot convert because of live VCC +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: live-vcc + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 undef [[DEF]], 1, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec + ; CHECK-NEXT: V_CMP_EQ_U32_e32 1, undef [[DEF1]], implicit-def $vcc_lo, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_LSHRREV_B32_e64_]], [[V_CMP_EQ_U32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e32_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 killed [[V_LSHRREV_B32_e64_]], killed [[V_LSHRREV_B32_e64_]], implicit $vcc_lo, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:sreg_32_xm0_xexec = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF - %3:vgpr_32 = V_PK_MAX_F16 8, undef %2, 8, undef %2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - %4:vgpr_32 = V_LSHRREV_B32_e64 16, undef %3, implicit $exec - %5:vgpr_32 = V_CNDMASK_B32_e64 0, 32256, 0, undef %4, killed undef %1, implicit $exec - $vgpr0 = COPY %5 + %3:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 undef %0, 1, implicit $exec + %5:vgpr_32 = V_LSHRREV_B32_e64 16, undef %2, implicit $exec + V_CMP_EQ_U32_e32 1, undef %2, implicit-def $vcc, implicit $exec + %6:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %5, %3, implicit $exec + %7:vgpr_32 = V_CNDMASK_B32_e32 killed %5, killed %5, implicit $vcc, implicit $exec + $vgpr0 = COPY %6 + SI_RETURN implicit $vgpr0 +... + +... +--- +name: cannot-shrink-source-mdoes # cannot shrink because of source modifiers +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: cannot-shrink-source-mdoes + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 undef [[DEF]], 0, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 1, 0, 0, [[V_LSHRREV_B32_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:sreg_32_xm0_xexec = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 undef %0, 0, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 1, 0, 0, %3, killed %2, implicit $exec + $vgpr0 = COPY %4 SI_RETURN implicit $vgpr0 ... +... 
+--- +name: missing-carry-def +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_32_xm0_xexec } +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: missing-carry-def + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_LSHRREV_B32_e64_]], undef %0:sreg_32_xm0_xexec, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1, implicit $exec + %3:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %2, undef %0, implicit $exec + $vgpr0 = COPY %3 + SI_RETURN implicit $vgpr0 +... diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir index 5a04ae6ea2b6b..cc4a1d307930f 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir @@ -1,35 +1,172 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc %s -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - | FileCheck -check-prefix=gfx9 %s -# Test conversion of V_CNDMASK_B32 to VOPC for enabling further conversion to SDWA. -# For this, the definition of the src2 carry-in operand must be changed to write -# to VCC. +--- +name: change-compare-to-vopc # carry-in def is a comparison that can be changed to VOPC +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; gfx9-LABEL: name: change-compare-to-vopc + ; gfx9: liveins: $vgpr0 + ; gfx9-NEXT: {{ $}} + ; gfx9-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; gfx9-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; gfx9-NEXT: $vcc = V_CMP_EQ_U32_e64 undef [[DEF]], 1, implicit $exec + ; gfx9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec + ; gfx9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; gfx9-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef [[DEF1]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec + ; gfx9-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] + ; gfx9-NEXT: SI_RETURN implicit $vgpr0 + %0:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:sreg_64_xexec = V_CMP_EQ_U32_e64 undef %0, 1, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed %2, implicit $exec + $vgpr0 = COPY %4 + SI_RETURN implicit $vgpr0 +... +... --- -name: v_vselect_v2bf16 +name: carry-copy-non-compare # copy of carry-in necessary because def. instr. 
cannot be changed to write to VCC tracksRegLiveness: true body: | + bb.0: + liveins: $vgpr0 + + ; gfx9-LABEL: name: carry-copy-non-compare + ; gfx9: liveins: $vgpr0 + ; gfx9-NEXT: {{ $}} + ; gfx9-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; gfx9-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; gfx9-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, undef [[DEF1]], 8, undef [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; gfx9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[V_PK_MAX_F16_]], implicit $exec + ; gfx9-NEXT: $vcc = COPY killed undef [[DEF]] + ; gfx9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; gfx9-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef [[V_PK_MAX_F16_]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec + ; gfx9-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] + ; gfx9-NEXT: SI_RETURN implicit $vgpr0 + %0:sreg_64_xexec = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = V_PK_MAX_F16 8, undef %1, 8, undef %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, undef %2, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed undef %0, implicit $exec + $vgpr0 = COPY %4 + SI_RETURN implicit $vgpr0 +... +... +--- +name: carry-copy-multiuse # copy of carry-in necessary because of second use +tracksRegLiveness: true +body: | bb.0: liveins: $vgpr0, $vgpr1 - ; gfx9-LABEL: name: v_vselect_v2bf16 + ; gfx9-LABEL: name: carry-copy-multiuse ; gfx9: liveins: $vgpr0, $vgpr1 ; gfx9-NEXT: {{ $}} - ; gfx9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; gfx9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; gfx9-NEXT: $vcc = V_CMP_EQ_U32_e64 killed [[COPY1]], 1, implicit $exec - ; gfx9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY]], implicit $exec + ; gfx9-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; gfx9-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; gfx9-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, undef [[DEF1]], 8, undef [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; gfx9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_PK_MAX_F16_]], implicit $exec + ; gfx9-NEXT: $vcc = COPY killed undef [[DEF]] ; gfx9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; gfx9-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[COPY]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec + ; gfx9-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[V_PK_MAX_F16_]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec ; gfx9-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] + ; gfx9-NEXT: $vgpr1 = COPY [[DEF]] + ; gfx9-NEXT: SI_RETURN implicit $vgpr0 + %0:sreg_64_xexec = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = V_PK_MAX_F16 8, undef %1, 8, undef %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed undef %0, implicit $exec + $vgpr0 = COPY %4 + $vgpr1 = COPY %0 + SI_RETURN implicit $vgpr0 +... + +... 
+--- +name: live-vcc # cannot convert because of live VCC +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; gfx9-LABEL: name: live-vcc + ; gfx9: liveins: $vgpr0 + ; gfx9-NEXT: {{ $}} + ; gfx9-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; gfx9-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; gfx9-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 undef [[DEF]], 1, implicit $exec + ; gfx9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec + ; gfx9-NEXT: V_CMP_EQ_U32_e32 1, undef [[DEF1]], implicit-def $vcc, implicit $exec + ; gfx9-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_LSHRREV_B32_e64_]], [[V_CMP_EQ_U32_e64_]], implicit $exec + ; gfx9-NEXT: [[V_CNDMASK_B32_e32_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 killed [[V_LSHRREV_B32_e64_]], killed [[V_LSHRREV_B32_e64_]], implicit $vcc, implicit $exec + ; gfx9-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] + ; gfx9-NEXT: SI_RETURN implicit $vgpr0 + %0:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = IMPLICIT_DEF + %3:sreg_64_xexec = V_CMP_EQ_U32_e64 undef %0, 1, implicit $exec + %5:vgpr_32 = V_LSHRREV_B32_e64 16, undef %2, implicit $exec + V_CMP_EQ_U32_e32 1, undef %2, implicit-def $vcc, implicit $exec + %6:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %5, %3, implicit $exec + %7:vgpr_32 = V_CNDMASK_B32_e32 killed %5, killed %5, implicit $vcc, implicit $exec + $vgpr0 = COPY %6 + SI_RETURN implicit $vgpr0 +... + +... +--- +name: cannot-shrink-source-mods # cannot shrink because of source modifiers +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; gfx9-LABEL: name: cannot-shrink-source-mods + ; gfx9: liveins: $vgpr0 + ; gfx9-NEXT: {{ $}} + ; gfx9-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; gfx9-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; gfx9-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 undef [[DEF]], 0, implicit $exec + ; gfx9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec + ; gfx9-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 1, 0, 0, [[V_LSHRREV_B32_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit $exec + ; gfx9-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] ; gfx9-NEXT: SI_RETURN implicit $vgpr0 - %1:vgpr_32 = COPY $vgpr0 - %2:vgpr_32 = COPY $vgpr1 - %3:sreg_64_xexec = V_CMP_EQ_U32_e64 killed %2, 1, implicit $exec - %4:vgpr_32 = V_LSHRREV_B32_e64 16, %1, implicit $exec - %5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %4, killed %3, implicit $exec - $vgpr0 = COPY %5 + %0:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:sreg_64_xexec = V_CMP_EQ_U32_e64 undef %0, 0, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 1, 0, 0, %3, killed %2, implicit $exec + $vgpr0 = COPY %4 SI_RETURN implicit $vgpr0 +... + +... 
+--- +name: missing-carry-def +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64_xexec } +body: | + bb.0: + liveins: $vgpr0 + ; gfx9-LABEL: name: missing-carry-def + ; gfx9: liveins: $vgpr0 + ; gfx9-NEXT: {{ $}} + ; gfx9-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; gfx9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF]], implicit $exec + ; gfx9-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_LSHRREV_B32_e64_]], undef %0:sreg_64_xexec, implicit $exec + ; gfx9-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] + ; gfx9-NEXT: SI_RETURN implicit $vgpr0 + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1, implicit $exec + %3:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %2, undef %0, implicit $exec + $vgpr0 = COPY %3 + SI_RETURN implicit $vgpr0 ... From 952881f2a1b827154bae53fe77e407c048d77559 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Fri, 2 May 2025 08:34:46 -0400 Subject: [PATCH 16/26] clang-format changes --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 3556e79b3ec1c..92bed084314e0 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1071,7 +1071,7 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI, /// applied if \p MI can be shrunk to VOP2 and if VCC can be proven to /// be dead before \p MI. void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI, - const GCNSubtarget &ST) const { + const GCNSubtarget &ST) const { assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64); LLVM_DEBUG(dbgs() << "Attempting VOP2 conversion: " << MI); From 5c4cae5860619fede4614cf47ce86345dc05de51 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Fri, 2 May 2025 08:38:48 -0400 Subject: [PATCH 17/26] Change test prefix --- .../sdwa-peephole-cndmask-vop3-wave64.mir | 134 +++++++++--------- 1 file changed, 67 insertions(+), 67 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir index cc4a1d307930f..1e2fb30eb24c4 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -# RUN: llc %s -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - | FileCheck -check-prefix=gfx9 %s +# RUN: llc %s -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - | FileCheck %s --- name: change-compare-to-vopc # carry-in def is a comparison that can be changed to VOPC @@ -8,17 +8,17 @@ body: | bb.0: liveins: $vgpr0 - ; gfx9-LABEL: name: change-compare-to-vopc - ; gfx9: liveins: $vgpr0 - ; gfx9-NEXT: {{ $}} - ; gfx9-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; gfx9-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; gfx9-NEXT: $vcc = V_CMP_EQ_U32_e64 undef [[DEF]], 1, implicit $exec - ; gfx9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec - ; gfx9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; gfx9-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef [[DEF1]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec - ; gfx9-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] - ; gfx9-NEXT: SI_RETURN 
implicit $vgpr0 + ; CHECK-LABEL: name: change-compare-to-vopc + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: $vcc = V_CMP_EQ_U32_e64 undef [[DEF]], 1, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef [[DEF1]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:vgpr_32 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF %2:sreg_64_xexec = V_CMP_EQ_U32_e64 undef %0, 1, implicit $exec @@ -36,18 +36,18 @@ body: | bb.0: liveins: $vgpr0 - ; gfx9-LABEL: name: carry-copy-non-compare - ; gfx9: liveins: $vgpr0 - ; gfx9-NEXT: {{ $}} - ; gfx9-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; gfx9-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; gfx9-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, undef [[DEF1]], 8, undef [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; gfx9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[V_PK_MAX_F16_]], implicit $exec - ; gfx9-NEXT: $vcc = COPY killed undef [[DEF]] - ; gfx9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; gfx9-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef [[V_PK_MAX_F16_]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec - ; gfx9-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] - ; gfx9-NEXT: SI_RETURN implicit $vgpr0 + ; CHECK-LABEL: name: carry-copy-non-compare + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, undef [[DEF1]], 8, undef [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[V_PK_MAX_F16_]], implicit $exec + ; CHECK-NEXT: $vcc = COPY killed undef [[DEF]] + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef [[V_PK_MAX_F16_]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:sreg_64_xexec = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = V_PK_MAX_F16 8, undef %1, 8, undef %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec @@ -65,19 +65,19 @@ body: | bb.0: liveins: $vgpr0, $vgpr1 - ; gfx9-LABEL: name: carry-copy-multiuse - ; gfx9: liveins: $vgpr0, $vgpr1 - ; gfx9-NEXT: {{ $}} - ; gfx9-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; gfx9-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; gfx9-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, undef [[DEF1]], 8, undef [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; gfx9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_PK_MAX_F16_]], implicit $exec - ; gfx9-NEXT: $vcc = COPY killed undef [[DEF]] - ; gfx9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; gfx9-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = 
V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[V_PK_MAX_F16_]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec - ; gfx9-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] - ; gfx9-NEXT: $vgpr1 = COPY [[DEF]] - ; gfx9-NEXT: SI_RETURN implicit $vgpr0 + ; CHECK-LABEL: name: carry-copy-multiuse + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, undef [[DEF1]], 8, undef [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_PK_MAX_F16_]], implicit $exec + ; CHECK-NEXT: $vcc = COPY killed undef [[DEF]] + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[V_PK_MAX_F16_]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] + ; CHECK-NEXT: $vgpr1 = COPY [[DEF]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:sreg_64_xexec = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = V_PK_MAX_F16 8, undef %1, 8, undef %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec @@ -96,18 +96,18 @@ body: | bb.0: liveins: $vgpr0 - ; gfx9-LABEL: name: live-vcc - ; gfx9: liveins: $vgpr0 - ; gfx9-NEXT: {{ $}} - ; gfx9-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; gfx9-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; gfx9-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 undef [[DEF]], 1, implicit $exec - ; gfx9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec - ; gfx9-NEXT: V_CMP_EQ_U32_e32 1, undef [[DEF1]], implicit-def $vcc, implicit $exec - ; gfx9-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_LSHRREV_B32_e64_]], [[V_CMP_EQ_U32_e64_]], implicit $exec - ; gfx9-NEXT: [[V_CNDMASK_B32_e32_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 killed [[V_LSHRREV_B32_e64_]], killed [[V_LSHRREV_B32_e64_]], implicit $vcc, implicit $exec - ; gfx9-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] - ; gfx9-NEXT: SI_RETURN implicit $vgpr0 + ; CHECK-LABEL: name: live-vcc + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 undef [[DEF]], 1, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec + ; CHECK-NEXT: V_CMP_EQ_U32_e32 1, undef [[DEF1]], implicit-def $vcc, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_LSHRREV_B32_e64_]], [[V_CMP_EQ_U32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e32_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 killed [[V_LSHRREV_B32_e64_]], killed [[V_LSHRREV_B32_e64_]], implicit $vcc, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF %3:sreg_64_xexec = V_CMP_EQ_U32_e64 undef %0, 1, implicit $exec @@ -127,16 +127,16 @@ body: | bb.0: liveins: $vgpr0 - ; gfx9-LABEL: name: cannot-shrink-source-mods - ; gfx9: liveins: $vgpr0 - ; gfx9-NEXT: {{ $}} - ; gfx9-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; gfx9-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; gfx9-NEXT: 
[[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 undef [[DEF]], 0, implicit $exec - ; gfx9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec - ; gfx9-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 1, 0, 0, [[V_LSHRREV_B32_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit $exec - ; gfx9-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] - ; gfx9-NEXT: SI_RETURN implicit $vgpr0 + ; CHECK-LABEL: name: cannot-shrink-source-mods + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 undef [[DEF]], 0, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 1, 0, 0, [[V_LSHRREV_B32_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:vgpr_32 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF %2:sreg_64_xexec = V_CMP_EQ_U32_e64 undef %0, 0, implicit $exec @@ -156,14 +156,14 @@ body: | bb.0: liveins: $vgpr0 - ; gfx9-LABEL: name: missing-carry-def - ; gfx9: liveins: $vgpr0 - ; gfx9-NEXT: {{ $}} - ; gfx9-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; gfx9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF]], implicit $exec - ; gfx9-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_LSHRREV_B32_e64_]], undef %0:sreg_64_xexec, implicit $exec - ; gfx9-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] - ; gfx9-NEXT: SI_RETURN implicit $vgpr0 + ; CHECK-LABEL: name: missing-carry-def + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_LSHRREV_B32_e64_]], undef %0:sreg_64_xexec, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1, implicit $exec %3:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %2, undef %0, implicit $exec From 9e406a91ed03e2e45e15d62ab0a180033792352f Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Fri, 2 May 2025 16:46:18 +0200 Subject: [PATCH 18/26] Apply suggestions from code review Co-authored-by: Matt Arsenault --- .../AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir | 4 +--- .../AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir | 14 +++++++------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir index 92b4c07bc45d5..5ff9bf0b89b60 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir @@ -150,8 +150,6 @@ body: | --- name: missing-carry-def tracksRegLiveness: true -registers: - - { id: 0, class: sreg_32_xm0_xexec } body: | bb.0: liveins: $vgpr0 @@ -165,7 +163,7 @@ body: | ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %1:vgpr_32 = IMPLICIT_DEF - %2:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1, implicit $exec + %2:vgpr_32 = 
V_LSHRREV_B32_e64 16, undef %1:sreg_32_xm0_xexec, implicit $exec %3:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %2, undef %0, implicit $exec $vgpr0 = COPY %3 SI_RETURN implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir index 1e2fb30eb24c4..d0b89cc121a30 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir @@ -50,9 +50,9 @@ body: | ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:sreg_64_xexec = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF - %2:vgpr_32 = V_PK_MAX_F16 8, undef %1, 8, undef %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2:vgpr_32 = V_PK_MAX_F16 8, %1, 8, %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec %3:vgpr_32 = V_LSHRREV_B32_e64 16, undef %2, implicit $exec - %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed undef %0, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed %0, implicit $exec $vgpr0 = COPY %4 SI_RETURN implicit $vgpr0 ... @@ -82,7 +82,7 @@ body: | %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = V_PK_MAX_F16 8, undef %1, 8, undef %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec %3:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec - %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed undef %0, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed %0, implicit $exec $vgpr0 = COPY %4 $vgpr1 = COPY %0 SI_RETURN implicit $vgpr0 @@ -110,9 +110,9 @@ body: | ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF - %3:sreg_64_xexec = V_CMP_EQ_U32_e64 undef %0, 1, implicit $exec - %5:vgpr_32 = V_LSHRREV_B32_e64 16, undef %2, implicit $exec - V_CMP_EQ_U32_e32 1, undef %2, implicit-def $vcc, implicit $exec + %3:sreg_64_xexec = V_CMP_EQ_U32_e64 %0, 1, implicit $exec + %5:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec + V_CMP_EQ_U32_e32 1, %2, implicit-def $vcc, implicit $exec %6:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %5, %3, implicit $exec %7:vgpr_32 = V_CNDMASK_B32_e32 killed %5, killed %5, implicit $vcc, implicit $exec $vgpr0 = COPY %6 @@ -165,7 +165,7 @@ body: | ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %1:vgpr_32 = IMPLICIT_DEF - %2:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1, implicit $exec + %2:vgpr_32 = V_LSHRREV_B32_e64 16, %1, implicit $exec %3:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %2, undef %0, implicit $exec $vgpr0 = COPY %3 SI_RETURN implicit $vgpr0 From a8f5dc8fcecc656d51483b197127d771f01b1a57 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Fri, 2 May 2025 11:15:55 -0400 Subject: [PATCH 19/26] Adjusts tests - Compact reg numbers in vop test - Remove "undef" - Readjust types in wave32 test --- .../AMDGPU/sdwa-peephole-cndmask-vop2.mir | 10 ++++---- .../sdwa-peephole-cndmask-vop3-wave32.mir | 10 ++++---- .../sdwa-peephole-cndmask-vop3-wave64.mir | 24 +++++++++---------- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop2.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop2.mir index b746320d7ecea..c3bcd6129e346 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop2.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop2.mir @@ -30,12 +30,12 @@ body: | ; GFX11-NEXT: [[V_CNDMASK_B32_e32_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 killed [[V_LSHRREV_B32_e64_]], killed [[V_LSHRREV_B32_e64_1]], implicit $exec, implicit $vcc_lo ; GFX11-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e32_]] ; GFX11-NEXT: 
SI_RETURN implicit $vgpr0 - %1:vgpr_32 = COPY $vgpr1 - %2:vgpr_32 = COPY $vgpr0 + %0:vgpr_32 = COPY $vgpr1 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_e64 16, %0, implicit $exec %3:vgpr_32 = V_LSHRREV_B32_e64 16, %1, implicit $exec - %4:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec - %7:vgpr_32 = V_CNDMASK_B32_e32 killed %3, killed %4, implicit $exec, implicit $vcc - $vgpr0 = COPY %7 + %4:vgpr_32 = V_CNDMASK_B32_e32 killed %2, killed %3, implicit $exec, implicit $vcc + $vgpr0 = COPY %4 SI_RETURN implicit $vgpr0 ... diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir index 5ff9bf0b89b60..bf00f204a6c39 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir @@ -52,7 +52,7 @@ body: | %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = V_PK_MAX_F16 8, undef %1, 8, undef %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec %3:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec - %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, undef %3, killed undef %0, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed undef %0, implicit $exec $vgpr0 = COPY %4 SI_RETURN implicit $vgpr0 ... @@ -150,6 +150,8 @@ body: | --- name: missing-carry-def tracksRegLiveness: true +registers: + - { id: 0, class: sreg_32_xm0_xexec } body: | bb.0: liveins: $vgpr0 @@ -157,14 +159,14 @@ body: | ; CHECK-LABEL: name: missing-carry-def ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF]], implicit $exec ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_LSHRREV_B32_e64_]], undef %0:sreg_32_xm0_xexec, implicit $exec ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] ; CHECK-NEXT: SI_RETURN implicit $vgpr0 - %1:vgpr_32 = IMPLICIT_DEF + %1:sreg_32_xm0_xexec = IMPLICIT_DEF %2:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1:sreg_32_xm0_xexec, implicit $exec - %3:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %2, undef %0, implicit $exec + %3:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %2, undef %0:sreg_32_xm0_xexec, implicit $exec $vgpr0 = COPY %3 SI_RETURN implicit $vgpr0 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir index d0b89cc121a30..0d8aea1405cc7 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir @@ -1,5 +1,5 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc %s -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - | FileCheck %s +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 --- name: change-compare-to-vopc # carry-in def is a comparison that can be changed to VOPC @@ -42,17 +42,17 @@ body: | ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, undef [[DEF1]], 8, undef [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[V_PK_MAX_F16_]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_PK_MAX_F16_]], implicit $exec ; CHECK-NEXT: $vcc = COPY killed undef [[DEF]] ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef [[V_PK_MAX_F16_]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[V_PK_MAX_F16_]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:sreg_64_xexec = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF - %2:vgpr_32 = V_PK_MAX_F16 8, %1, 8, %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - %3:vgpr_32 = V_LSHRREV_B32_e64 16, undef %2, implicit $exec - %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed %0, implicit $exec + %2:vgpr_32 = V_PK_MAX_F16 8, undef %1, 8, undef %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed undef %0, implicit $exec $vgpr0 = COPY %4 SI_RETURN implicit $vgpr0 ... 
@@ -82,7 +82,7 @@ body: | %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = V_PK_MAX_F16 8, undef %1, 8, undef %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec %3:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec - %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed %0, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed undef %0, implicit $exec $vgpr0 = COPY %4 $vgpr1 = COPY %0 SI_RETURN implicit $vgpr0 @@ -110,9 +110,9 @@ body: | ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF - %3:sreg_64_xexec = V_CMP_EQ_U32_e64 %0, 1, implicit $exec - %5:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec - V_CMP_EQ_U32_e32 1, %2, implicit-def $vcc, implicit $exec + %3:sreg_64_xexec = V_CMP_EQ_U32_e64 undef %0, 1, implicit $exec + %5:vgpr_32 = V_LSHRREV_B32_e64 16, undef %2, implicit $exec + V_CMP_EQ_U32_e32 1, undef %2, implicit-def $vcc, implicit $exec %6:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %5, %3, implicit $exec %7:vgpr_32 = V_CNDMASK_B32_e32 killed %5, killed %5, implicit $vcc, implicit $exec $vgpr0 = COPY %6 @@ -165,8 +165,8 @@ body: | ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %1:vgpr_32 = IMPLICIT_DEF - %2:vgpr_32 = V_LSHRREV_B32_e64 16, %1, implicit $exec - %3:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %2, undef %0, implicit $exec + %2:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1, implicit $exec + %3:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %2, undef %0:sreg_64_xexec, implicit $exec $vgpr0 = COPY %3 SI_RETURN implicit $vgpr0 ... From e943523351e81ec6d4136bdda6a070b3281b4af4 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Fri, 2 May 2025 11:48:59 -0400 Subject: [PATCH 20/26] Make sure that V_CND_MASK gets handled --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 3 +- .../sdwa-peephole-cndmask-vop3-wave32.mir | 27 +++++++++++++++++ .../sdwa-peephole-cndmask-vop3-wave64.mir | 29 ++++++++++++++++++- 3 files changed, 57 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 92bed084314e0..c5873e1d998f1 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1101,7 +1101,8 @@ void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI, // Change destination of compare instruction to VCC // or copy to VCC if carry-in is not a compare inst. - if (CarryDef->isCompare() && TII->isVOP3(*CarryDef) && + if (TII->isVOP3(*CarryDef) && + TII->isVOPC(AMDGPU::getVOPe32(CarryDef->getOpcode())) && MRI->hasOneUse(CarryIn.getReg())) CarryDef->substituteRegister(CarryIn.getReg(), Vcc, 0, *TRI); else { diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir index bf00f204a6c39..57c38386f0161 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir @@ -28,6 +28,33 @@ body: | SI_RETURN implicit $vgpr0 ... 
+--- +name: change-compare-class-to-vopc # check that non-compare instr V_CMP_CLASS is also handled +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: change-compare-class-to-vopc + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: $vcc_lo = V_CMP_CLASS_F32_e64 2, undef [[DEF]], 1, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef [[DEF1]], 0, 6, 0, 6, 5, implicit $vcc_lo, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:sreg_32_xm0_xexec = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:sreg_32_xm0_xexec = V_CMP_CLASS_F32_e64 2, undef %0, 1, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed %2, implicit $exec + $vgpr0 = COPY %4 + SI_RETURN implicit $vgpr0 +... + ... --- name: carry-copy-non-compare # copy of carry-in necessary because def. instr. cannot be changed to write to VCC diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir index 0d8aea1405cc7..d19a36bf03710 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir @@ -1,5 +1,5 @@ -# RUN: llc %s -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - | FileCheck %s # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc %s -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - | FileCheck %s --- name: change-compare-to-vopc # carry-in def is a comparison that can be changed to VOPC @@ -28,6 +28,33 @@ body: | SI_RETURN implicit $vgpr0 ... +--- +name: change-compare-class-to-vopc # check that non-compare instr V_CMP_CLASS is also handled +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: change-compare-class-to-vopc + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: $vcc = V_CMP_CLASS_F32_e64 2, undef [[DEF]], 1, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef [[DEF1]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:sreg_32_xm0_xexec = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:sreg_64_xexec = V_CMP_CLASS_F32_e64 2, undef %0, 1, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed %2, implicit $exec + $vgpr0 = COPY %4 + SI_RETURN implicit $vgpr0 +... + ... --- name: carry-copy-non-compare # copy of carry-in necessary because def. instr. 
cannot be changed to write to VCC From d027b652975bf8cbad926f104a1626ed67949798 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Fri, 2 May 2025 11:53:44 -0400 Subject: [PATCH 21/26] Change tests to avoid the impression that the carry-in def will be rewritten to VOPC --- .../CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir | 8 ++++---- .../CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir index 57c38386f0161..ec2efd5ceaa9c 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir @@ -2,13 +2,13 @@ # RUN: llc %s -mtriple=amdgcn -mcpu=gfx1030 -run-pass=si-peephole-sdwa -o - | FileCheck %s --- -name: change-compare-to-vopc # carry-in def is a comparison that can be changed to VOPC +name: change-compare-dest-to-vcc # carry-in def is a comparison that can write to VCC tracksRegLiveness: true body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: change-compare-to-vopc + ; CHECK-LABEL: name: change-compare-dest-to-vcc ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF @@ -29,13 +29,13 @@ body: | ... --- -name: change-compare-class-to-vopc # check that non-compare instr V_CMP_CLASS is also handled +name: change-compare-class-dest-to-vcc # check that non-compare instr V_CMP_CLASS is also handled tracksRegLiveness: true body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: change-compare-class-to-vopc + ; CHECK-LABEL: name: change-compare-class-dest-to-vcc ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir index d19a36bf03710..8104f01298e47 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir @@ -2,13 +2,13 @@ # RUN: llc %s -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - | FileCheck %s --- -name: change-compare-to-vopc # carry-in def is a comparison that can be changed to VOPC +name: change-compare-dest-to-vcc # carry-in def is a comparison that can write to VCC tracksRegLiveness: true body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: change-compare-to-vopc + ; CHECK-LABEL: name: change-compare-dest-to-vcc ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF @@ -29,13 +29,13 @@ body: | ... 
 ---
-name: change-compare-class-to-vopc # check that non-compare instr V_CMP_CLASS is also handled
+name: change-compare-class-dest-to-vcc # check that non-compare instr V_CMP_CLASS is also handled
 tracksRegLiveness: true
 body: |
   bb.0:
     liveins: $vgpr0
 
-    ; CHECK-LABEL: name: change-compare-class-to-vopc
+    ; CHECK-LABEL: name: change-compare-class-dest-to-vcc
     ; CHECK: liveins: $vgpr0
     ; CHECK-NEXT: {{ $}}
     ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF

From d027b652975bf8cbad926f104a1626ed67949798 Mon Sep 17 00:00:00 2001
From: Frederik Harwath
Date: Mon, 5 May 2025 04:23:15 -0400
Subject: [PATCH 22/26] Always copy from carry-in operand to VCC

---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 19 +--
 llvm/test/CodeGen/AMDGPU/bf16.ll | 99 ++++++-----
 llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 10 +-
 llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 10 +-
 .../CodeGen/AMDGPU/dagcombine-fmul-sel.ll | 156 +++++++++---------
 llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll | 12 +-
 llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll | 12 +-
 .../CodeGen/AMDGPU/fneg-modifier-casting.ll | 58 ++++---
 llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 145 ++++++++--------
 llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 145 ++++++++--------
 .../sdwa-peephole-cndmask-vop3-wave32.mir | 6 +-
 .../sdwa-peephole-cndmask-vop3-wave64.mir | 6 +-
 llvm/test/CodeGen/AMDGPU/select.f16.ll | 97 ++++++-----
 13 files changed, 407 insertions(+), 368 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index c5873e1d998f1..a320e6769a115 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1065,11 +1065,9 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
 /// Try to convert an \p MI in VOP3 which takes an src2 carry-in
 /// operand into the corresponding VOP2 form which expects the
-/// argument in VCC. To this end, either try to change the definition
-/// of the carry-in operand to write to VCC or add an instruction that
-/// copies from the carry-in to VCC. The conversion will only be
-/// applied if \p MI can be shrunk to VOP2 and if VCC can be proven to
-/// be dead before \p MI.
+/// argument in VCC. To this end, add a copy from the carry-in to
+/// VCC. The conversion will only be applied if \p MI can be shrunk
+/// to VOP2 and if VCC can be proven to be dead before \p MI.
 void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
   assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64);
@@ -1099,16 +1097,7 @@ void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI,
     return;
   }
 
-  // Change destination of compare instruction to VCC
-  // or copy to VCC if carry-in is not a compare inst.
- if (TII->isVOP3(*CarryDef) && - TII->isVOPC(AMDGPU::getVOPe32(CarryDef->getOpcode())) && - MRI->hasOneUse(CarryIn.getReg())) - CarryDef->substituteRegister(CarryIn.getReg(), Vcc, 0, *TRI); - else { - BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), Vcc) - .add(CarryIn); - } + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), Vcc).add(CarryIn); auto Converted = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::getVOPe32(MI.getOpcode()))) diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index c656a20c10945..c4957fd44e2be 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -40777,39 +40777,41 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[4:5] -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, v3, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, v4, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX9-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX9-NEXT: s_mov_b64 vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[6:7] ; GFX9-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v0 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v7, v5, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v0 -; GFX10-NEXT: v_cndmask_b32_sdwa v2, v6, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_sdwa v3, v7, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v4, s4 -; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 -; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v4, s5 +; GFX10-NEXT: 
v_cndmask_b32_e32 v1, v7, v5, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v3, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11TRUE16-LABEL: v_vselect_v4bf16: @@ -41058,39 +41060,36 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v6 -; GFX10-NEXT: v_and_b32_e32 v6, 1, v7 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v11 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v15, v11, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v10, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v16, v11, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v13, v9, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v12, v8, s4 -; GFX10-NEXT: v_cndmask_b32_sdwa v8, v12, v8, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v4 +; GFX10-NEXT: v_cndmask_b32_sdwa v4, v15, v11, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v10, v9, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc_lo +; GFX10-NEXT: s_mov_b32 vcc_lo, s6 +; GFX10-NEXT: v_cndmask_b32_sdwa v6, v14, v10, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: s_mov_b32 vcc_lo, s5 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v13, v9, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v7, v12, v8, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: v_perm_b32 v0, v7, v0, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 ; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc_lo -; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 -; GFX10-NEXT: v_perm_b32 v3, v6, v3, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v15, v11, vcc_lo +; GFX10-NEXT: v_perm_b32 v2, v6, v5, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11TRUE16-LABEL: v_vselect_v8bf16: diff --git 
a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 3cea1d17a2bfa..92ece0d007fe2 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -652,13 +652,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 32 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v1 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v1 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 32 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v3 ; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm @@ -760,6 +760,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 32 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 @@ -768,8 +769,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 32 -; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index afaabe08d6d6d..3c45596fba14b 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -1503,13 +1503,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x100, v1 +; GFX9-GISEL-NEXT: v_or_b32_e32 v3, 0x100, v1 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v3 ; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm @@ -1604,6 +1604,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff ; 
GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 @@ -1612,8 +1613,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v1, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll index 4e10f461beebf..a511233af0703 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll @@ -2793,11 +2793,11 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX10-LABEL: fmul_select_bf16_test1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX10-NEXT: v_mov_b32_e32 v4, 0x3f80 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x4000 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x3f80 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -2927,11 +2927,11 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX10-LABEL: fmul_select_bf16_test2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, 0x3f00 -; GFX10-NEXT: v_mov_b32_e32 v4, 0x3f80 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x3f00 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x3f80 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -3049,24 +3049,24 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a ; GFX9-LABEL: fmul_select_v2bf16_test3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3f80 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v3, s[4:5] -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: 
v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-NEXT: v_cndmask_b32_sdwa v3, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_mov_b64 vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -3078,24 +3078,24 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a ; GFX10-LABEL: fmul_select_v2bf16_test3: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, 0x4000 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x3f80 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, v2, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX10-NEXT: v_mov_b32_e32 v4, 0x3f80 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, v1, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_cndmask_b32_sdwa v1, v1, v5, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0x3f80, v5, s4 -; GFX10-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v3, v4, v2, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v2 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo @@ -3258,24 +3258,24 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a ; GFX9-LABEL: fmul_select_v2bf16_test4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3f80 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x3f00 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v3, s[4:5] -; GFX9-NEXT: v_cndmask_b32_sdwa 
v1, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x3f00 +; GFX9-NEXT: v_cndmask_b32_sdwa v3, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_mov_b64 vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -3287,24 +3287,24 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a ; GFX10-LABEL: fmul_select_v2bf16_test4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, 0x3f00 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x3f80 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, v2, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x3f00 +; GFX10-NEXT: v_mov_b32_e32 v4, 0x3f80 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, v1, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_cndmask_b32_sdwa v1, v1, v5, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0x3f80, v5, s4 -; GFX10-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v3, v4, v2, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v2 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo @@ -3480,11 +3480,11 @@ define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX10-LABEL: fmul_select_bf16_test5: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX10-NEXT: v_mov_b32_e32 
v4, 0x4100 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x4000 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x4100 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -3616,11 +3616,11 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX10-LABEL: fmul_select_bf16_test6: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffffc100 -; GFX10-NEXT: v_mov_b32_e32 v4, 0x4040 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0xffffc100 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x4040 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -3751,11 +3751,11 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX10-LABEL: fmul_select_bf16_test7: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, 0x4100 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffffc080 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x4100 +; GFX10-NEXT: v_mov_b32_e32 v2, 0xffffc080 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -4015,11 +4015,11 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX10-LABEL: fmul_select_bf16_test9: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffffc180 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffffc200 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0xffffc180 +; GFX10-NEXT: v_mov_b32_e32 v2, 0xffffc200 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -4151,11 +4151,11 @@ define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b ; GFX10-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffffe000 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffffdb80 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0xffffe000 +; GFX10-NEXT: v_mov_b32_e32 v2, 0xffffdb80 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, 
v0 -; GFX10-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -4287,11 +4287,11 @@ define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %b ; GFX10-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, 0x3480 -; GFX10-NEXT: v_mov_b32_e32 v4, 0x4c00 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x3480 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x4c00 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll index 16e217008ace5..67a9c12dca94a 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll @@ -113,8 +113,10 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 { ; VI-SAFE-LABEL: test_fmax_legacy_ugt_v2f16: ; VI-SAFE: ; %bb.0: ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_cmp_nle_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v2, v1, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v3, v2 +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -217,8 +219,10 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 { ; VI-SAFE-LABEL: test_fmax_legacy_ugt_v3f16: ; VI-SAFE: ; %bb.0: ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_cmp_nle_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v4, v2, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll index 144074e114045..fd809c6103d2c 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll @@ -114,8 +114,10 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 { ; VI-SAFE-LABEL: test_fmin_legacy_ule_v2f16: ; 
VI-SAFE: ; %bb.0: ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_cmp_ngt_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v2, v1, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -218,8 +220,10 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 { ; VI-SAFE-LABEL: test_fmin_legacy_ule_v3f16: ; VI-SAFE: ; %bb.0: ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_cmp_ngt_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v4, v2, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index d03e6eef2e364..924378eb2376d 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -743,25 +743,24 @@ define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2 ; GFX9-LABEL: select_fneg_select_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_xor_b32_e32 v3, 0x80008000, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, v4, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v3, v1, v0, s4 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v4 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v3 +; GFX9-NEXT: v_cndmask_b32_sdwa v3, v1, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v3, v0, s6 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX9-NEXT: s_mov_b64 vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v3, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: 
v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s6 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: select_fneg_select_v2f16: @@ -850,25 +849,24 @@ define <2 x i16> @select_fneg_xor_select_v2i16(<2 x i1> %cond0, <2 x i1> %cond1, ; GFX9-LABEL: select_fneg_xor_select_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_xor_b32_e32 v3, 0x80008000, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, v4, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v3, v1, v0, s4 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v4 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v3 +; GFX9-NEXT: v_cndmask_b32_sdwa v3, v1, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v3, v0, s6 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX9-NEXT: s_mov_b64 vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v3, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s6 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: select_fneg_xor_select_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 426e05236e62f..17fdc841a1258 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -664,8 +664,10 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX8-LABEL: v_maximum_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v2 +; GFX8-NEXT: v_max_f16_e32 v2, v3, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX8-NEXT: v_cndmask_b32_sdwa v2, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f16_e32 v4, v0, v1 @@ -696,12 +698,12 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX10-LABEL: v_maximum_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX10-NEXT: v_pk_max_f16 v3, v0, v1 +; GFX10-NEXT: v_pk_max_f16 v2, v0, v1 ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_mov_b32_e32 v3, 0x7e00 ; 
GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v1 -; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v3, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v2, s4 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -830,8 +832,10 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX8-LABEL: v_maximum_v2f16__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v2 +; GFX8-NEXT: v_max_f16_e32 v2, v3, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX8-NEXT: v_cndmask_b32_sdwa v2, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f16_e32 v4, v0, v1 @@ -862,12 +866,12 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX10-LABEL: v_maximum_v2f16__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX10-NEXT: v_pk_max_f16 v3, v0, v1 +; GFX10-NEXT: v_pk_max_f16 v2, v0, v1 ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v1 -; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v3, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v2, s4 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1160,8 +1164,10 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX8-LABEL: v_maximum_v3f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v4, v5, v4 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f16_e32 v6, v1, v3 @@ -1199,12 +1205,12 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX10-LABEL: v_maximum_v3f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX10-NEXT: v_pk_max_f16 v5, v0, v2 +; GFX10-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v2 -; GFX10-NEXT: v_cndmask_b32_sdwa v2, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v5, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v5, v4, 
vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v4, s4 ; GFX10-NEXT: v_pk_max_f16 v4, v1, v3 ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 ; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 @@ -1363,8 +1369,10 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX8-LABEL: v_maximum_v3f16__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v4, v5, v4 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f16_e32 v6, v1, v3 @@ -1402,12 +1410,12 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX10-LABEL: v_maximum_v3f16__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX10-NEXT: v_pk_max_f16 v5, v0, v2 +; GFX10-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v2 -; GFX10-NEXT: v_cndmask_b32_sdwa v2, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v5, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v5, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v4, s4 ; GFX10-NEXT: v_pk_max_f16 v4, v1, v3 ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 ; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 @@ -1573,8 +1581,10 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX8-LABEL: v_maximum_v4f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v4, v5, v4 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v0 @@ -1622,19 +1632,20 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX10-LABEL: v_maximum_v4f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX10-NEXT: v_pk_max_f16 v5, v1, v3 -; GFX10-NEXT: v_pk_max_f16 v6, v0, v2 -; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v2 -; GFX10-NEXT: v_cndmask_b32_sdwa v7, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v6, s4 +; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_pk_max_f16 v6, v1, v3 +; GFX10-NEXT: v_cmp_o_f16_sdwa s4, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_pk_max_f16 v5, v0, v2 +; 
GFX10-NEXT: v_cmp_o_f16_e64 s5, v0, v2 ; GFX10-NEXT: v_cndmask_b32_sdwa v2, v4, v6, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo -; GFX10-NEXT: v_perm_b32 v1, v7, v1, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v5, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v6, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_maximum_v4f16: @@ -1810,8 +1821,10 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX8-LABEL: v_maximum_v4f16__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v4, v5, v4 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v0 @@ -1859,19 +1872,20 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX10-LABEL: v_maximum_v4f16__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX10-NEXT: v_pk_max_f16 v5, v1, v3 -; GFX10-NEXT: v_pk_max_f16 v6, v0, v2 -; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v2 -; GFX10-NEXT: v_cndmask_b32_sdwa v7, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v6, s4 +; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_pk_max_f16 v6, v1, v3 +; GFX10-NEXT: v_cmp_o_f16_sdwa s4, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_pk_max_f16 v5, v0, v2 +; GFX10-NEXT: v_cmp_o_f16_e64 s5, v0, v2 ; GFX10-NEXT: v_cndmask_b32_sdwa v2, v4, v6, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo -; GFX10-NEXT: v_perm_b32 v1, v7, v1, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v5, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v6, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_maximum_v4f16__nsz: @@ -2168,30 +2182,31 @@ define <8 x half> @v_maximum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_pk_max_f16 v8, v3, v7 ; GFX10-NEXT: v_mov_b32_e32 v9, 0x7e00 +; GFX10-NEXT: v_pk_max_f16 v10, v2, v6 ; GFX10-NEXT: v_cmp_o_f16_sdwa s4, v2, v6 src0_sel:WORD_1 
src1_sel:WORD_1 -; GFX10-NEXT: v_pk_max_f16 v11, v2, v6 ; GFX10-NEXT: v_cmp_o_f16_sdwa s5, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_pk_max_f16 v12, v1, v5 -; GFX10-NEXT: v_cndmask_b32_sdwa v10, v9, v8, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_pk_max_f16 v12, v0, v4 +; GFX10-NEXT: v_cndmask_b32_sdwa v11, v9, v8, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v10, vcc_lo ; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_pk_max_f16 v15, v0, v4 -; GFX10-NEXT: v_cndmask_b32_sdwa v13, v9, v11, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_o_f16_sdwa s4, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_sdwa v6, v9, v10, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_pk_max_f16 v10, v1, v5 ; GFX10-NEXT: s_mov_b32 vcc_lo, s5 -; GFX10-NEXT: v_cmp_o_f16_e64 s4, v2, v6 -; GFX10-NEXT: v_cndmask_b32_sdwa v14, v9, v12, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0x7e00, v11, s4 -; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v4 -; GFX10-NEXT: v_cndmask_b32_sdwa v4, v9, v15, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_sdwa v13, v9, v10, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v12, vcc_lo +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v4, v9, v12, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 -; GFX10-NEXT: v_perm_b32 v2, v13, v2, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v15, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v12, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 ; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v14, v1, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v10, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 +; GFX10-NEXT: v_perm_b32 v1, v13, v1, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v8, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v10, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v11, v3, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_maximum_v8f16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index a8e7ccb5d326f..b8e5be785a77d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -554,8 +554,10 @@ define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX8-LABEL: v_minimum_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v2 +; GFX8-NEXT: v_min_f16_e32 v2, v3, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX8-NEXT: v_cndmask_b32_sdwa v2, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f16_e32 v4, v0, v1 @@ -586,12 +588,12 @@ define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX10-LABEL: v_minimum_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX10-NEXT: v_pk_min_f16 v3, v0, v1 +; GFX10-NEXT: v_pk_min_f16 v2, v0, v1 ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v1 -; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v3, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v2, s4 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -685,8 +687,10 @@ define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX8-LABEL: v_minimum_v2f16__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v2 +; GFX8-NEXT: v_min_f16_e32 v2, v3, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX8-NEXT: v_cndmask_b32_sdwa v2, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f16_e32 v4, v0, v1 @@ -717,12 +721,12 @@ define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX10-LABEL: v_minimum_v2f16__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX10-NEXT: v_pk_min_f16 v3, v0, v1 +; GFX10-NEXT: v_pk_min_f16 v2, v0, v1 ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v1 -; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v3, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v2, s4 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -946,8 +950,10 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX8-LABEL: v_minimum_v3f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_min_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v4 +; GFX8-NEXT: v_min_f16_e32 v4, v5, v4 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f16_e32 v6, v1, v3 @@ -985,12 +991,12 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX10-LABEL: v_minimum_v3f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX10-NEXT: v_pk_min_f16 v5, v0, v2 +; GFX10-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v2 -; GFX10-NEXT: v_cndmask_b32_sdwa v2, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v5, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v5, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v4, s4 ; GFX10-NEXT: v_pk_min_f16 v4, v1, v3 ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 ; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 @@ -1102,8 +1108,10 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX8-LABEL: v_minimum_v3f16__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_o_f16 vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_min_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v4 +; GFX8-NEXT: v_min_f16_e32 v4, v5, v4 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f16_e32 v6, v1, v3 @@ -1141,12 +1149,12 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX10-LABEL: v_minimum_v3f16__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX10-NEXT: v_pk_min_f16 v5, v0, v2 +; GFX10-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v2 -; GFX10-NEXT: v_cndmask_b32_sdwa v2, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v5, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v5, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v4, s4 ; GFX10-NEXT: v_pk_min_f16 v4, v1, v3 ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 ; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 @@ -1258,8 +1266,10 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX8-LABEL: v_minimum_v4f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_min_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v4 +; GFX8-NEXT: v_min_f16_e32 v4, v5, v4 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v0 @@ -1307,19 +1317,20 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX10-LABEL: v_minimum_v4f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX10-NEXT: v_pk_min_f16 v5, v1, v3 -; GFX10-NEXT: v_pk_min_f16 v6, v0, v2 -; GFX10-NEXT: 
v_cmp_o_f16_e64 s4, v0, v2 -; GFX10-NEXT: v_cndmask_b32_sdwa v7, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v6, s4 +; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_pk_min_f16 v6, v1, v3 +; GFX10-NEXT: v_cmp_o_f16_sdwa s4, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_pk_min_f16 v5, v0, v2 +; GFX10-NEXT: v_cmp_o_f16_e64 s5, v0, v2 ; GFX10-NEXT: v_cndmask_b32_sdwa v2, v4, v6, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo -; GFX10-NEXT: v_perm_b32 v1, v7, v1, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v5, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v6, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_minimum_v4f16: @@ -1436,8 +1447,10 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX8-LABEL: v_minimum_v4f16__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_o_f16 vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_min_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v4 +; GFX8-NEXT: v_min_f16_e32 v4, v5, v4 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v0 @@ -1485,19 +1498,20 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX10-LABEL: v_minimum_v4f16__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX10-NEXT: v_pk_min_f16 v5, v1, v3 -; GFX10-NEXT: v_pk_min_f16 v6, v0, v2 -; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v2 -; GFX10-NEXT: v_cndmask_b32_sdwa v7, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v6, s4 +; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_pk_min_f16 v6, v1, v3 +; GFX10-NEXT: v_cmp_o_f16_sdwa s4, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_pk_min_f16 v5, v0, v2 +; GFX10-NEXT: v_cmp_o_f16_e64 s5, v0, v2 ; GFX10-NEXT: v_cndmask_b32_sdwa v2, v4, v6, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo -; GFX10-NEXT: v_perm_b32 v1, v7, v1, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v5, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v6, vcc_lo +; GFX10-NEXT: 
v_perm_b32 v0, v4, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_minimum_v4f16__nsz: @@ -1707,30 +1721,31 @@ define <8 x half> @v_minimum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_pk_min_f16 v8, v3, v7 ; GFX10-NEXT: v_mov_b32_e32 v9, 0x7e00 +; GFX10-NEXT: v_pk_min_f16 v10, v2, v6 ; GFX10-NEXT: v_cmp_o_f16_sdwa s4, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_pk_min_f16 v11, v2, v6 ; GFX10-NEXT: v_cmp_o_f16_sdwa s5, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_pk_min_f16 v12, v1, v5 -; GFX10-NEXT: v_cndmask_b32_sdwa v10, v9, v8, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_pk_min_f16 v12, v0, v4 +; GFX10-NEXT: v_cndmask_b32_sdwa v11, v9, v8, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v10, vcc_lo ; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_pk_min_f16 v15, v0, v4 -; GFX10-NEXT: v_cndmask_b32_sdwa v13, v9, v11, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_o_f16_sdwa s4, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_sdwa v6, v9, v10, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_pk_min_f16 v10, v1, v5 ; GFX10-NEXT: s_mov_b32 vcc_lo, s5 -; GFX10-NEXT: v_cmp_o_f16_e64 s4, v2, v6 -; GFX10-NEXT: v_cndmask_b32_sdwa v14, v9, v12, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0x7e00, v11, s4 -; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v4 -; GFX10-NEXT: v_cndmask_b32_sdwa v4, v9, v15, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_sdwa v13, v9, v10, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v12, vcc_lo +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v4, v9, v12, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 -; GFX10-NEXT: v_perm_b32 v2, v13, v2, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v15, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v12, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 ; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v14, v1, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v10, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 +; GFX10-NEXT: v_perm_b32 v1, v13, v1, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v8, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v10, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v11, v3, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_minimum_v8f16: diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir index ec2efd5ceaa9c..8f0757b00a3f6 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir @@ -13,8 +13,9 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: 
[[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: $vcc_lo = V_CMP_EQ_U32_e64 undef [[DEF]], 1, implicit $exec + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 undef [[DEF]], 1, implicit $exec ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec + ; CHECK-NEXT: $vcc_lo = COPY killed [[V_CMP_EQ_U32_e64_]] ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef [[DEF1]], 0, 6, 0, 6, 5, implicit $vcc_lo, implicit $exec ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] @@ -40,8 +41,9 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: $vcc_lo = V_CMP_CLASS_F32_e64 2, undef [[DEF]], 1, implicit $exec + ; CHECK-NEXT: [[V_CMP_CLASS_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_CLASS_F32_e64 2, undef [[DEF]], 1, implicit $exec ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec + ; CHECK-NEXT: $vcc_lo = COPY killed [[V_CMP_CLASS_F32_e64_]] ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef [[DEF1]], 0, 6, 0, 6, 5, implicit $vcc_lo, implicit $exec ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir index 8104f01298e47..7041f7311d797 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir @@ -13,8 +13,9 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: $vcc = V_CMP_EQ_U32_e64 undef [[DEF]], 1, implicit $exec + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 undef [[DEF]], 1, implicit $exec ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec + ; CHECK-NEXT: $vcc = COPY killed [[V_CMP_EQ_U32_e64_]] ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef [[DEF1]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] @@ -40,8 +41,9 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: $vcc = V_CMP_CLASS_F32_e64 2, undef [[DEF]], 1, implicit $exec + ; CHECK-NEXT: [[V_CMP_CLASS_F32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_CLASS_F32_e64 2, undef [[DEF]], 1, implicit $exec ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec + ; CHECK-NEXT: $vcc = COPY killed [[V_CMP_CLASS_F32_e64_]] ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef [[DEF1]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] diff 
--git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index cf2f84bbfa97d..21719226710de 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -809,30 +809,33 @@ define amdgpu_kernel void @select_v2f16( ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s18, s6 +; VI-NEXT: s_mov_b32 s22, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s16, s10 -; VI-NEXT: s_mov_b32 s17, s11 -; VI-NEXT: s_mov_b32 s19, s7 ; VI-NEXT: s_mov_b32 s20, s12 ; VI-NEXT: s_mov_b32 s21, s13 -; VI-NEXT: s_mov_b32 s22, s6 ; VI-NEXT: s_mov_b32 s23, s7 +; VI-NEXT: s_mov_b32 s16, s10 +; VI-NEXT: s_mov_b32 s17, s11 +; VI-NEXT: s_mov_b32 s18, s6 +; VI-NEXT: s_mov_b32 s19, s7 +; VI-NEXT: buffer_load_dword v0, off, s[20:23], 0 +; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 ; VI-NEXT: s_mov_b32 s12, s14 ; VI-NEXT: s_mov_b32 s13, s15 ; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_mov_b32 s15, s7 -; VI-NEXT: s_mov_b32 s2, s6 -; VI-NEXT: s_mov_b32 s3, s7 -; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 -; VI-NEXT: buffer_load_dword v1, off, s[20:23], 0 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v3, off, s[12:15], 0 ; VI-NEXT: s_mov_b32 s4, s8 ; VI-NEXT: s_mov_b32 s5, s9 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_cmp_lt_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_cmp_lt_f16_e64 s[0:1], v0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; VI-NEXT: v_cmp_lt_f16_e32 vcc, v5, v4 +; VI-NEXT: v_cmp_lt_f16_e64 s[0:1], v1, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e64 v0, v2, v3, s[0:1] ; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -1005,6 +1008,7 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s2 ; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; VI-NEXT: s_mov_b32 s16, s4 ; VI-NEXT: s_mov_b32 s17, s5 ; VI-NEXT: s_mov_b32 s18, s10 @@ -1013,14 +1017,14 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 -; VI-NEXT: v_mov_b32_e32 v3, 0x3900 +; VI-NEXT: s_movk_i32 s2, 0x3900 ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_cmp_gt_f16 vcc, v0, v3 src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: v_cmp_lt_f16_e32 vcc, s2, v3 ; VI-NEXT: v_cmp_lt_f16_e64 s[0:1], 0.5, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] @@ -1180,6 +1184,7 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s2 ; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; VI-NEXT: s_mov_b32 s16, s4 ; VI-NEXT: s_mov_b32 s17, s5 ; VI-NEXT: s_mov_b32 s18, s10 @@ -1188,14 +1193,14 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; VI-NEXT: 
buffer_load_dword v2, off, s[4:7], 0 -; VI-NEXT: v_mov_b32_e32 v3, 0x3900 +; VI-NEXT: s_movk_i32 s2, 0x3900 ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_cmp_lt_f16 vcc, v0, v3 src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: v_cmp_gt_f16_e32 vcc, s2, v3 ; VI-NEXT: v_cmp_gt_f16_e64 s[0:1], 0.5, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] @@ -1350,33 +1355,36 @@ define amdgpu_kernel void @select_v2f16_imm_c( ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s2 -; VI-NEXT: s_mov_b32 s13, s3 ; VI-NEXT: s_mov_b32 s16, s4 ; VI-NEXT: s_mov_b32 s17, s5 -; VI-NEXT: s_mov_b32 s18, s10 -; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; VI-NEXT: v_mov_b32_e32 v3, 0x3800 ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc -; VI-NEXT: v_cmp_nlt_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_mov_b32_e32 v0, 0x3900 -; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v5, v4 +; VI-NEXT: v_mov_b32_e32 v1, 0x3900 +; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; @@ -1526,31 +1534,34 @@ define amdgpu_kernel void @select_v2f16_imm_d( ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s2 -; VI-NEXT: s_mov_b32 s13, s3 ; VI-NEXT: s_mov_b32 s16, s4 ; VI-NEXT: s_mov_b32 s17, s5 -; VI-NEXT: s_mov_b32 s18, s10 -; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; VI-NEXT: s_mov_b32 
s7, s11 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; VI-NEXT: v_mov_b32_e32 v3, 0x3800 ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_lt_f16 vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_cmp_lt_f16_e64 s[0:1], v0, v1 -; VI-NEXT: v_mov_b32_e32 v1, 0x3900 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; VI-NEXT: v_cmp_lt_f16_e32 vcc, v1, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[0:1] +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; VI-NEXT: v_cmp_lt_f16_e32 vcc, v5, v4 +; VI-NEXT: v_mov_b32_e32 v1, 0x3900 ; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 From 8d7825a354364d417b1252ab98745a4543628e60 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Mon, 5 May 2025 13:22:24 +0200 Subject: [PATCH 23/26] Apply suggestions from code review Co-authored-by: Matt Arsenault --- .../sdwa-peephole-cndmask-vop3-wave32.mir | 10 +++++----- .../sdwa-peephole-cndmask-vop3-wave64.mir | 20 +++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir index 8f0757b00a3f6..33d886ef8e5fc 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir @@ -79,7 +79,7 @@ body: | ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:sreg_32_xm0_xexec = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF - %2:vgpr_32 = V_PK_MAX_F16 8, undef %1, 8, undef %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2:vgpr_32 = V_PK_MAX_F16 8, %1, 8, %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec %3:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed undef %0, implicit $exec $vgpr0 = COPY %4 @@ -109,7 +109,7 @@ body: | ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:sreg_32_xm0_xexec = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF - %2:vgpr_32 = V_PK_MAX_F16 8, undef %1, 8, undef %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2:vgpr_32 = V_PK_MAX_F16 8, %1, 8, %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec %3:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed undef %0, implicit $exec $vgpr0 = COPY %4 @@ -139,8 +139,8 @@ body: | ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:sreg_32_xm0_xexec = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF - %3:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 undef %0, 1, implicit $exec - %5:vgpr_32 = V_LSHRREV_B32_e64 16, undef %2, implicit $exec + %3:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 %0, 1, implicit $exec + %5:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec V_CMP_EQ_U32_e32 1, undef %2, implicit-def $vcc, implicit $exec %6:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %5, %3, implicit $exec %7:vgpr_32 = V_CNDMASK_B32_e32 killed %5, killed %5, implicit $vcc, implicit $exec @@ -194,7 +194,7 @@ body: | ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %1:sreg_32_xm0_xexec = IMPLICIT_DEF - %2:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1:sreg_32_xm0_xexec, implicit $exec + %2:vgpr_32 = V_LSHRREV_B32_e64 16, %1:sreg_32_xm0_xexec, implicit $exec %3:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %2, undef 
%0:sreg_32_xm0_xexec, implicit $exec $vgpr0 = COPY %3 SI_RETURN implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir index 7041f7311d797..d73edc7345371 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir @@ -22,8 +22,8 @@ body: | ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:vgpr_32 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF - %2:sreg_64_xexec = V_CMP_EQ_U32_e64 undef %0, 1, implicit $exec - %3:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1, implicit $exec + %2:sreg_64_xexec = V_CMP_EQ_U32_e64 %0, 1, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, %1, implicit $exec %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed %2, implicit $exec $vgpr0 = COPY %4 SI_RETURN implicit $vgpr0 @@ -50,8 +50,8 @@ body: | ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:sreg_32_xm0_xexec = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF - %2:sreg_64_xexec = V_CMP_CLASS_F32_e64 2, undef %0, 1, implicit $exec - %3:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1, implicit $exec + %2:sreg_64_xexec = V_CMP_CLASS_F32_e64 2,%0, 1, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, %1, implicit $exec %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed %2, implicit $exec $vgpr0 = COPY %4 SI_RETURN implicit $vgpr0 @@ -79,9 +79,9 @@ body: | ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:sreg_64_xexec = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF - %2:vgpr_32 = V_PK_MAX_F16 8, undef %1, 8, undef %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2:vgpr_32 = V_PK_MAX_F16 8, %1, 8, %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec %3:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec - %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed undef %0, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed %0, implicit $exec $vgpr0 = COPY %4 SI_RETURN implicit $vgpr0 ... 
@@ -111,7 +111,7 @@ body: | %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = V_PK_MAX_F16 8, undef %1, 8, undef %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec %3:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec - %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed undef %0, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed %0, implicit $exec $vgpr0 = COPY %4 $vgpr1 = COPY %0 SI_RETURN implicit $vgpr0 @@ -139,9 +139,9 @@ body: | %0:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF - %3:sreg_64_xexec = V_CMP_EQ_U32_e64 undef %0, 1, implicit $exec - %5:vgpr_32 = V_LSHRREV_B32_e64 16, undef %2, implicit $exec - V_CMP_EQ_U32_e32 1, undef %2, implicit-def $vcc, implicit $exec + %3:sreg_64_xexec = V_CMP_EQ_U32_e64 %0, 1, implicit $exec + %5:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec + V_CMP_EQ_U32_e32 1, %2, implicit-def $vcc, implicit $exec %6:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %5, %3, implicit $exec %7:vgpr_32 = V_CNDMASK_B32_e32 killed %5, killed %5, implicit $vcc, implicit $exec $vgpr0 = COPY %6 From 5a98da935697952cd98a23a26ce9190b81a7d211 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Mon, 5 May 2025 13:23:31 +0200 Subject: [PATCH 24/26] Apply suggestions from code review Co-authored-by: Matt Arsenault --- .../test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir index 33d886ef8e5fc..5ada9ec070394 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir @@ -168,8 +168,8 @@ body: | ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:sreg_32_xm0_xexec = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF - %2:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 undef %0, 0, implicit $exec - %3:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1, implicit $exec + %2:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 %0, 0, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, %1, implicit $exec %4:vgpr_32 = V_CNDMASK_B32_e64 1, 0, 0, %3, killed %2, implicit $exec $vgpr0 = COPY %4 SI_RETURN implicit $vgpr0 From b79a9ce5169a4ed3d836149c593881f5293cbbab Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Mon, 5 May 2025 08:56:37 -0400 Subject: [PATCH 25/26] Adjust tests - Move VOP2 test cases into VOP3 wave32, wave64 test files - Adjust test names and comments to reflect the fact that we no longer attempt to change the carry-in operands to VOP2.
--- .../sdwa-peephole-cndmask-vop3-wave32.mir | 69 +++++++++++----- .../sdwa-peephole-cndmask-vop3-wave64.mir | 78 +++++++++++++------ 2 files changed, 105 insertions(+), 42 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir index 5ada9ec070394..4b45c54a3b83d 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir @@ -2,13 +2,44 @@ # RUN: llc %s -mtriple=amdgcn -mcpu=gfx1030 -run-pass=si-peephole-sdwa -o - | FileCheck %s --- -name: change-compare-dest-to-vcc # carry-in def is a comparison that can write to VCC +name: cndmask_b32 # can be directly converted to SDWA without a copy to VCC +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vcc + + ; CHECK-LABEL: name: cndmask_b32 + ; CHECK: liveins: $vgpr0, $vgpr1, $vcc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[COPY]], 0, [[COPY1]], 0, 6, 0, 5, 5, implicit $vcc_lo, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:vgpr_32 = COPY $vgpr1 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_e64 16, %0, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, %1, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e32 killed %2, killed %3, implicit $exec, implicit $vcc + $vgpr0 = COPY %4 + SI_RETURN implicit $vgpr0 +... + +# For SDWA conversion of V_CNDMASK, the carry-in operand must be +# available in VCC_LO. This is achieved by introducing a COPY +# instruction. Comparison instructions could be changed to VOP2 form +# instead, but we prefer to use a COPY. + +--- +name: carry-compare tracksRegLiveness: true body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: change-compare-dest-to-vcc + ; CHECK-LABEL: name: carry-compare ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF @@ -30,13 +61,13 @@ body: | ... --- -name: change-compare-class-dest-to-vcc # check that non-compare instr V_CMP_CLASS is also handled +name: carry-compare-class tracksRegLiveness: true body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: change-compare-class-dest-to-vcc + ; CHECK-LABEL: name: carry-compare-class ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF @@ -59,18 +90,18 @@ body: | ... --- -name: carry-copy-non-compare # copy of carry-in necessary because def. instr.
cannot be changed to write to VCC +name: carry-non-compare tracksRegLiveness: true body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: carry-copy-non-compare + ; CHECK-LABEL: name: carry-non-compare ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, undef [[DEF1]], 8, undef [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[DEF1]], 8, [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_PK_MAX_F16_]], implicit $exec ; CHECK-NEXT: $vcc_lo = COPY killed undef [[DEF]] ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -88,20 +119,20 @@ body: | ... --- -name: carry-copy-multiuse # copy of carry-in necessary because of second use +name: carry-multiuse tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: carry-copy-multiuse + ; CHECK-LABEL: name: carry-multiuse ; CHECK: liveins: $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, undef [[DEF1]], 8, undef [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[DEF1]], 8, [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_PK_MAX_F16_]], implicit $exec - ; CHECK-NEXT: $vcc_lo = COPY killed undef [[DEF]] + ; CHECK-NEXT: $vcc_lo = COPY undef [[DEF]] ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[V_PK_MAX_F16_]], 0, 6, 0, 6, 5, implicit $vcc_lo, implicit $exec ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] @@ -111,7 +142,7 @@ body: | %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = V_PK_MAX_F16 8, %1, 8, %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec %3:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec - %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed undef %0, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, undef %0, implicit $exec $vgpr0 = COPY %4 $vgpr1 = COPY %0 SI_RETURN implicit $vgpr0 @@ -130,8 +161,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 undef [[DEF]], 1, implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[DEF]], 1, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[DEF1]], implicit $exec ; CHECK-NEXT: V_CMP_EQ_U32_e32 1, undef [[DEF1]], implicit-def $vcc_lo, implicit $exec ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_LSHRREV_B32_e64_]], [[V_CMP_EQ_U32_e64_]], implicit $exec ; CHECK-NEXT: [[V_CNDMASK_B32_e32_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 killed [[V_LSHRREV_B32_e64_]], killed [[V_LSHRREV_B32_e64_]], implicit $vcc_lo, implicit $exec @@ 
-150,19 +181,19 @@ body: | ... --- -name: cannot-shrink-source-mdoes # cannot shrink because of source modifiers +name: cannot-shrink-with-source-mods tracksRegLiveness: true body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: cannot-shrink-source-mdoes + ; CHECK-LABEL: name: cannot-shrink-with-source-mods ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 undef [[DEF]], 0, implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[DEF]], 0, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[DEF1]], implicit $exec ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 1, 0, 0, [[V_LSHRREV_B32_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit $exec ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -189,7 +220,7 @@ body: | ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF - ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[DEF]], implicit $exec ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_LSHRREV_B32_e64_]], undef %0:sreg_32_xm0_xexec, implicit $exec ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] ; CHECK-NEXT: SI_RETURN implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir index d73edc7345371..e243df4077ff4 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir @@ -2,22 +2,54 @@ # RUN: llc %s -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - | FileCheck %s --- -name: change-compare-dest-to-vcc # carry-in def is a comparison that can write to VCC +name: cndmask_b32 # can be directly converted to SDWA without a copy to VCC +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vcc + + ; CHECK-LABEL: name: cndmask_b32 + ; CHECK: liveins: $vgpr0, $vgpr1, $vcc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[COPY]], 0, [[COPY1]], 0, 6, 0, 5, 5, implicit $vcc, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:vgpr_32 = COPY $vgpr1 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_e64 16, %0, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, %1, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e32 killed %2, killed %3, implicit $exec, implicit $vcc + $vgpr0 = COPY %4 + SI_RETURN implicit $vgpr0 + +... + +# For SDWA conversion of V_CNDMASK, the carry-in operand must be +# available in VCC. This is achieved by introducing a COPY +# instruction. 
Comparison instructions could be changed to VOP2 form +# instead, but we prefer to use a COPY. + +--- +name: carry-compare tracksRegLiveness: true body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: change-compare-dest-to-vcc + ; CHECK-LABEL: name: carry-compare ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 undef [[DEF]], 1, implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[DEF]], 1, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[DEF1]], implicit $exec ; CHECK-NEXT: $vcc = COPY killed [[V_CMP_EQ_U32_e64_]] ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef [[DEF1]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[DEF1]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:vgpr_32 = IMPLICIT_DEF @@ -30,22 +62,22 @@ body: | ... --- -name: change-compare-class-dest-to-vcc # check that non-compare instr V_CMP_CLASS is also handled +name: carry-compare-class tracksRegLiveness: true body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: change-compare-class-dest-to-vcc + ; CHECK-LABEL: name: carry-compare-class ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_CMP_CLASS_F32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_CLASS_F32_e64 2, undef [[DEF]], 1, implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_CMP_CLASS_F32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_CLASS_F32_e64 2, [[DEF]], 1, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[DEF1]], implicit $exec ; CHECK-NEXT: $vcc = COPY killed [[V_CMP_CLASS_F32_e64_]] ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef [[DEF1]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[DEF1]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:sreg_32_xm0_xexec = IMPLICIT_DEF @@ -59,20 +91,20 @@ body: | ... --- -name: carry-copy-non-compare # copy of carry-in necessary because def. instr. 
cannot be changed to write to VCC +name: carry-non-compare tracksRegLiveness: true body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: carry-copy-non-compare + ; CHECK-LABEL: name: carry-non-compare ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, undef [[DEF1]], 8, undef [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[DEF1]], 8, [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_PK_MAX_F16_]], implicit $exec - ; CHECK-NEXT: $vcc = COPY killed undef [[DEF]] + ; CHECK-NEXT: $vcc = COPY killed [[DEF]] ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[V_PK_MAX_F16_]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] @@ -88,20 +120,20 @@ body: | ... --- -name: carry-copy-multiuse # copy of carry-in necessary because of second use +name: carry-multiuse tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: carry-copy-multiuse + ; CHECK-LABEL: name: carry-multiuse ; CHECK: liveins: $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, undef [[DEF1]], 8, undef [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_PK_MAX_F16_]], implicit $exec - ; CHECK-NEXT: $vcc = COPY killed undef [[DEF]] + ; CHECK-NEXT: $vcc = COPY [[DEF]] ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[V_PK_MAX_F16_]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] @@ -111,7 +143,7 @@ body: | %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = V_PK_MAX_F16 8, undef %1, 8, undef %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec %3:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec - %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed %0, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, %0, implicit $exec $vgpr0 = COPY %4 $vgpr1 = COPY %0 SI_RETURN implicit $vgpr0 @@ -130,9 +162,9 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 undef [[DEF]], 1, implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec - ; CHECK-NEXT: V_CMP_EQ_U32_e32 1, undef [[DEF1]], implicit-def $vcc, implicit $exec + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[DEF]], 1, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[DEF1]], implicit $exec + ; CHECK-NEXT: V_CMP_EQ_U32_e32 1, [[DEF1]], implicit-def $vcc, implicit $exec ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_LSHRREV_B32_e64_]], [[V_CMP_EQ_U32_e64_]], implicit $exec ; CHECK-NEXT: 
[[V_CNDMASK_B32_e32_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 killed [[V_LSHRREV_B32_e64_]], killed [[V_LSHRREV_B32_e64_]], implicit $vcc, implicit $exec ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] @@ -150,13 +182,13 @@ body: | ... --- -name: cannot-shrink-source-mods # cannot shrink because of source modifiers +name: cannot-shrink-with-source-mods tracksRegLiveness: true body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: cannot-shrink-source-mods + ; CHECK-LABEL: name: cannot-shrink-with-source-mods ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF From 1975582ca3e9b13da401290fdc1801bea8fd8297 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Mon, 5 May 2025 08:59:35 -0400 Subject: [PATCH 26/26] Rename VOP2 test file and remove "-vop3" from other test names ... since those files now also cover the vop2 case. --- .../AMDGPU/sdwa-peephole-cndmask-vop2.mir | 41 ------------------- ...2.mir => sdwa-peephole-cndmask-wave32.mir} | 0 ...4.mir => sdwa-peephole-cndmask-wave64.mir} | 0 3 files changed, 41 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop2.mir rename llvm/test/CodeGen/AMDGPU/{sdwa-peephole-cndmask-vop3-wave32.mir => sdwa-peephole-cndmask-wave32.mir} (100%) rename llvm/test/CodeGen/AMDGPU/{sdwa-peephole-cndmask-vop3-wave64.mir => sdwa-peephole-cndmask-wave64.mir} (100%) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop2.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop2.mir deleted file mode 100644 index c3bcd6129e346..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop2.mir +++ /dev/null @@ -1,41 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=si-peephole-sdwa -mcpu=gfx803 -o - %s | FileCheck -check-prefix=GFX8 %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=si-peephole-sdwa -mcpu=gfx1100 -o - %s | FileCheck -check-prefix=GFX11 %s - ---- -name: v_cndmask_b32_test -tracksRegLiveness: true -body: | - bb.0: - liveins: $vgpr0, $vgpr1, $vcc - - ; GFX8-LABEL: name: v_cndmask_b32_test - ; GFX8: liveins: $vgpr0, $vgpr1, $vcc - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX8-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[COPY]], 0, [[COPY1]], 0, 6, 0, 5, 5, implicit $vcc, implicit $exec - ; GFX8-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] - ; GFX8-NEXT: SI_RETURN implicit $vgpr0 - ; - ; GFX11-LABEL: name: v_cndmask_b32_test - ; GFX11: liveins: $vgpr0, $vgpr1, $vcc - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY]], implicit $exec - ; GFX11-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY1]], implicit $exec - ; GFX11-NEXT: [[V_CNDMASK_B32_e32_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 killed [[V_LSHRREV_B32_e64_]], killed [[V_LSHRREV_B32_e64_1]], implicit $exec, implicit $vcc_lo - ; GFX11-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e32_]] - ; GFX11-NEXT: SI_RETURN implicit $vgpr0 - %0:vgpr_32 = COPY $vgpr1 - %1:vgpr_32 = COPY $vgpr0 - %2:vgpr_32 = 
V_LSHRREV_B32_e64 16, %0, implicit $exec - %3:vgpr_32 = V_LSHRREV_B32_e64 16, %1, implicit $exec - %4:vgpr_32 = V_CNDMASK_B32_e32 killed %2, killed %3, implicit $exec, implicit $vcc - $vgpr0 = COPY %4 - SI_RETURN implicit $vgpr0 - -... diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir similarity index 100% rename from llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave32.mir rename to llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave64.mir similarity index 100% rename from llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-vop3-wave64.mir rename to llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave64.mir