diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index eb9aabf8b6317..67832cfc0c571 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -126,6 +126,7 @@ class SIFoldOperandsImpl {
   std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;
   bool tryConstantFoldOp(MachineInstr *MI) const;
   bool tryFoldCndMask(MachineInstr &MI) const;
+  bool tryScalarizeReadLaneSrc(MachineInstr &MI) const;
   bool tryFoldZeroHighBits(MachineInstr &MI) const;
   bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
 
@@ -1407,6 +1408,192 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
   return true;
 }
 
+/// Return the SALU opcode that computes the same value as the VALU opcode
+/// \p Opc, or AMDGPU::INSTRUCTION_LIST_END if \p Opc is not handled (or its
+/// scalar form is not available on \p ST).
+///
+/// \p Ops holds the explicit source operands of the VALU instruction in
+/// order. It is permuted in place when the scalar instruction expects its
+/// sources in a different order than the vector one.
+static unsigned
+getScalarizedReadLaneSrcOpc(const GCNSubtarget &ST, unsigned Opc,
+                            SmallVectorImpl<MachineOperand *> &Ops) {
+  // Opcodes here are added as-needed because there are hundreds of
+  // instructions we could convert, but realistically we only need
+  // the most frequent ones to make an impact.
+  //
+  // The InstCombine version of this transform will do the heavy
+  // lifting, this is just a cleanup for the readlanes added during
+  // lowering.
+  switch (Opc) {
+  case AMDGPU::V_OR_B32_e32:
+  case AMDGPU::V_OR_B32_e64:
+    return AMDGPU::S_OR_B32;
+  case AMDGPU::V_MUL_HI_U32_e64:
+    if (ST.getGeneration() >= GCNSubtarget::GFX9)
+      return AMDGPU::S_MUL_HI_U32;
+    break;
+  case AMDGPU::V_AND_B32_e32:
+  case AMDGPU::V_AND_B32_e64:
+    return AMDGPU::S_AND_B32;
+  case AMDGPU::V_LSHRREV_B32_e32: // dst = S1 >> S0
+  case AMDGPU::V_LSHRREV_B32_e64:
+    assert(Ops.size() == 2 && "expected a binary shift");
+    std::swap(Ops[0], Ops[1]); // dst = S0 >> S1 (!)
+    return AMDGPU::S_LSHR_B32;
+  case AMDGPU::V_CVT_U32_F32_e32:
+  case AMDGPU::V_CVT_U32_F32_e64:
+    if (ST.hasSALUFloatInsts())
+      return AMDGPU::S_CVT_U32_F32;
+    break;
+  case AMDGPU::V_MIN_U32_e32:
+  case AMDGPU::V_MIN_U32_e64:
+    return AMDGPU::S_MIN_U32;
+  case AMDGPU::V_MIN_I32_e32:
+  case AMDGPU::V_MIN_I32_e64:
+    return AMDGPU::S_MIN_I32;
+  case AMDGPU::V_MAX_U32_e32:
+  case AMDGPU::V_MAX_U32_e64:
+    return AMDGPU::S_MAX_U32;
+  case AMDGPU::V_MAX_I32_e32:
+  case AMDGPU::V_MAX_I32_e64:
+    return AMDGPU::S_MAX_I32;
+  default:
+    break;
+  }
+
+  return AMDGPU::INSTRUCTION_LIST_END;
+}
+
+// Try to transform
+//   %0:vgpr = (valu op) %x:vgpr
+//   %1:sgpr = v_readfirstlane %0
+// Into
+//   %0:sgpr = v_readfirstlane %x:vgpr
+//   %1:sgpr = (salu op) %0
+//
+// The new instructions take the place of the VALU instruction; the VALU
+// instruction and \p MI are both erased. Returns true if the transform was
+// applied.
+bool SIFoldOperandsImpl::tryScalarizeReadLaneSrc(MachineInstr &MI) const {
+  const unsigned Opc = MI.getOpcode();
+  if (Opc != AMDGPU::V_READFIRSTLANE_B32 && Opc != AMDGPU::V_READLANE_B32)
+    return false;
+
+  const auto VSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+  const MachineOperand &VSrcOp = MI.getOperand(VSrcIdx);
+  const Register SDst = MI.getOperand(0).getReg();
+
+  // The rewrite relies on unique SSA definitions and derives a new register
+  // from SDst's class, so bail out on physical registers and on partial
+  // (subregister) reads of the source.
+  if (!VSrcOp.isReg() || VSrcOp.getSubReg() || !VSrcOp.getReg().isVirtual() ||
+      !SDst.isVirtual())
+    return false;
+
+  const Register VSrc = VSrcOp.getReg();
+  if (!MRI->hasOneNonDBGUse(VSrc))
+    return false;
+
+  MachineInstr *VSrcDef = MRI->getVRegDef(VSrc);
+  // Need a unary or binary VALU instruction as operand.
+  if (!VSrcDef || (VSrcDef->getParent() != MI.getParent()) ||
+      !TII->isVALU(*VSrcDef) || VSrcDef->getNumExplicitOperands() > 3 ||
+      execMayBeModifiedBeforeUse(*MRI, VSrc, *VSrcDef, MI))
+    return false;
+
+  const bool IsReadLane = (Opc == AMDGPU::V_READLANE_B32);
+  if (IsReadLane) {
+    // The readlane is hoisted to the VALU instruction's position, so a lane
+    // index held in a register must not change in between. Non-register lane
+    // operands need no check.
+    const MachineOperand &LaneOp = MI.getOperand(2);
+    if (LaneOp.isReg()) {
+      Register LaneReg = LaneOp.getReg();
+      for (auto It = VSrcDef->getIterator(); It != MI.getIterator(); ++It) {
+        if (It->modifiesRegister(LaneReg, TRI))
+          return false;
+      }
+    }
+  }
+
+  SmallVector<MachineOperand *, 2> Ops;
+  MachineOperand *TargetOp = nullptr;
+  for (MachineOperand &SrcOp : VSrcDef->operands()) {
+    if (SrcOp.isReg()) {
+      if (SrcOp.isImplicit() || SrcOp.isDef())
+        continue;
+
+      Ops.push_back(&SrcOp);
+
+      Register Reg = SrcOp.getReg();
+      if (TRI->isVectorRegister(*MRI, Reg)) {
+        // This only works if we have one VGPR src.
+        if (TargetOp)
+          return false;
+        TargetOp = &SrcOp;
+      }
+    } else {
+      Ops.push_back(&SrcOp); // also collect imms
+    }
+  }
+  if (!TargetOp)
+    return false;
+
+  const unsigned ScalarOp =
+      getScalarizedReadLaneSrcOpc(*ST, VSrcDef->getOpcode(), Ops);
+  if (ScalarOp == AMDGPU::INSTRUCTION_LIST_END)
+    return false;
+
+  // We only support unary/binary ops.
+  assert(Ops.size() <= 2);
+
+  MachineBasicBlock *MBB = VSrcDef->getParent();
+  MachineBasicBlock::iterator InsertBefore = VSrcDef->getIterator();
+
+  // Unlike their VALU counterparts, most SALU instructions clobber SCC, so
+  // the scalar instruction may only be placed where SCC is known to be dead.
+  const MCInstrDesc &ScalarDesc = TII->get(ScalarOp);
+  const bool ClobbersSCC = ScalarDesc.hasImplicitDefOfPhysReg(AMDGPU::SCC);
+  if (ClobbersSCC &&
+      MBB->computeRegisterLiveness(TRI, AMDGPU::SCC, InsertBefore) !=
+          MachineBasicBlock::LQR_Dead)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "tryScalarizeReadLaneSrc:\n\treadlane: " << MI
+                    << "\tsrc: " << *VSrcDef << "\top: " << *TargetOp << "\n");
+
+  const DebugLoc &DL = VSrcDef->getDebugLoc();
+  Register STargetOp = MRI->createVirtualRegister(MRI->getRegClass(SDst));
+  // Keep the subregister index: the VALU operand may read part of a wider
+  // register.
+  auto NewMI = BuildMI(*MBB, InsertBefore, DL, MI.getDesc(), STargetOp)
+                   .addReg(TargetOp->getReg(), 0, TargetOp->getSubReg());
+  if (IsReadLane) {
+    // The lane index is now read earlier than in MI, so a kill flag copied
+    // from MI could precede other uses of the register.
+    MachineOperand LaneOp = MI.getOperand(2);
+    if (LaneOp.isReg())
+      LaneOp.setIsKill(false);
+    NewMI.add(LaneOp); // lane index
+  }
+
+  auto ScalarMI = BuildMI(*MBB, InsertBefore, DL, ScalarDesc, SDst);
+  for (MachineOperand *Op : Ops) {
+    if (Op == TargetOp)
+      ScalarMI.addReg(STargetOp);
+    else
+      ScalarMI.add(*Op);
+  }
+  // SCC was verified dead at this point, so the new def has no readers.
+  if (ClobbersSCC)
+    ScalarMI->addRegisterDead(AMDGPU::SCC, TRI);
+
+  VSrcDef->eraseFromParent();
+  MI.eraseFromParent();
+  return true;
+}
+
 bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
   if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
       MI.getOpcode() != AMDGPU::V_AND_B32_e32)
@@ -2353,6 +2540,11 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) {
     for (auto &MI : 
make_early_inc_range(*MBB)) { Changed |= tryFoldCndMask(MI); + if (tryScalarizeReadLaneSrc(MI)) { + Changed = true; + continue; + } + if (tryFoldZeroHighBits(MI)) { Changed = true; continue; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll index 2389924b82484..76cdbaa661579 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -730,18 +730,18 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX6-NEXT: s_lshl_b32 s3, s4, 24 ; GFX6-NEXT: s_lshl_b32 s4, s7, 24 ; GFX6-NEXT: s_not_b32 s5, s3 -; GFX6-NEXT: s_min_u32 s4, s5, s4 ; GFX6-NEXT: s_lshr_b32 s1, s1, 24 +; GFX6-NEXT: s_min_u32 s4, s5, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_lshr_b32 s2, s2, 24 ; GFX6-NEXT: s_add_i32 s3, s3, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: s_lshr_b32 s3, s3, 24 ; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24 +; GFX6-NEXT: s_lshr_b32 s3, s3, 24 ; GFX6-NEXT: s_lshl_b32 s0, s2, 16 -; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX6-NEXT: s_lshl_b32 s0, s3, 24 -; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: v_readfirstlane_b32 s1, v0 +; GFX6-NEXT: s_or_b32 s0, s1, s0 +; GFX6-NEXT: s_lshl_b32 s1, s3, 24 +; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_v4i8: @@ -1020,8 +1020,8 @@ define amdgpu_ps i24 @s_uaddsat_i24(i24 inreg %lhs, i24 inreg %rhs) { ; GFX8-NEXT: s_lshl_b32 s0, s0, 8 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 clamp -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: s_lshr_b32 s0, s0, 8 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_i24: @@ -1030,8 +1030,8 @@ define amdgpu_ps i24 @s_uaddsat_i24(i24 inreg %lhs, i24 inreg %rhs) { ; GFX9-NEXT: s_lshl_b32 s0, s0, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: 
v_add_u32_e64 v0, s0, v0 clamp -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_lshr_b32 s0, s0, 8 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_uaddsat_i24: @@ -1039,8 +1039,8 @@ define amdgpu_ps i24 @s_uaddsat_i24(i24 inreg %lhs, i24 inreg %rhs) { ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8 ; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v0, s0, s1 clamp -; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 8 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i24 @llvm.uadd.sat.i24(i24 %lhs, i24 %rhs) ret i24 %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll index 34d36581a21db..0311e0fb4d68c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -714,18 +714,18 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX6-NEXT: s_sub_i32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s4, 24 ; GFX6-NEXT: s_lshl_b32 s4, s7, 24 -; GFX6-NEXT: s_min_u32 s4, s3, s4 ; GFX6-NEXT: s_lshr_b32 s1, s1, 24 +; GFX6-NEXT: s_min_u32 s4, s3, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_lshr_b32 s2, s2, 24 ; GFX6-NEXT: s_sub_i32 s3, s3, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: s_lshr_b32 s3, s3, 24 ; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24 +; GFX6-NEXT: s_lshr_b32 s3, s3, 24 ; GFX6-NEXT: s_lshl_b32 s0, s2, 16 -; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX6-NEXT: s_lshl_b32 s0, s3, 24 -; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: v_readfirstlane_b32 s1, v0 +; GFX6-NEXT: s_or_b32 s0, s1, s0 +; GFX6-NEXT: s_lshl_b32 s1, s3, 24 +; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_v4i8: @@ -1002,8 +1002,8 @@ define amdgpu_ps i24 @s_usubsat_i24(i24 
inreg %lhs, i24 inreg %rhs) { ; GFX8-NEXT: s_lshl_b32 s0, s0, 8 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s0, v0 clamp -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: s_lshr_b32 s0, s0, 8 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_i24: @@ -1012,8 +1012,8 @@ define amdgpu_ps i24 @s_usubsat_i24(i24 inreg %lhs, i24 inreg %rhs) { ; GFX9-NEXT: s_lshl_b32 s0, s0, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_lshr_b32 s0, s0, 8 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_usubsat_i24: @@ -1021,8 +1021,8 @@ define amdgpu_ps i24 @s_usubsat_i24(i24 inreg %lhs, i24 inreg %rhs) { ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8 ; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, s0, s1 clamp -; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 8 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i24 @llvm.usub.sat.i24(i24 %lhs, i24 %rhs) ret i24 %result diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 3737cc414c58f..39f9bd3768a42 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -7180,10 +7180,10 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 -; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: 
s_and_b32 s4, s4, 0xff +; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX7LESS-NEXT: v_or_b32_e32 v0, s4, v0 ; GFX7LESS-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm @@ -7214,10 +7214,10 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX8-NEXT: .LBB12_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_and_b32 s4, s4, 0xff ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -7251,10 +7251,10 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace ; GFX9-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX9-NEXT: .LBB12_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -7289,11 +7289,11 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace ; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s7, v0 ; GFX1064-NEXT: .LBB12_2: ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: s_and_b32 s2, s2, 0xff ; GFX1064-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -7325,11 +7325,11 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace ; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s5, 
v0 ; GFX1032-NEXT: .LBB12_2: ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc_lo +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: s_and_b32 s2, s2, 0xff ; GFX1032-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -7363,13 +7363,13 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace ; GFX1164-NEXT: v_lshrrev_b32_e32 v0, s7, v0 ; GFX1164-NEXT: .LBB12_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_and_b32 s2, s2, 0xff +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b8 v0, off, s[0:3], 0 @@ -7402,13 +7402,13 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace ; GFX1132-NEXT: v_lshrrev_b32_e32 v0, s6, v0 ; GFX1132-NEXT: .LBB12_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-NEXT: 
v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_cndmask_b32_e64 v0, s4, 0, vcc_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_and_b32 s2, s2, 0xff +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b8 v0, off, s[0:3], 0 @@ -7442,13 +7442,13 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace ; GFX1264-NEXT: v_lshrrev_b32_e32 v0, s7, v0 ; GFX1264-NEXT: .LBB12_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264-NEXT: s_and_b32 s2, s2, 0xff +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1264-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX1264-NEXT: s_mov_b32 s2, -1 ; GFX1264-NEXT: buffer_store_b8 v0, off, s[0:3], null @@ -7481,13 +7481,13 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace ; GFX1232-NEXT: v_lshrrev_b32_e32 v0, s6, v0 ; GFX1232-NEXT: .LBB12_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1232-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: v_cndmask_b32_e64 v0, s4, 0, vcc_lo -; GFX1232-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232-NEXT: s_and_b32 s2, s2, 0xff +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1232-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX1232-NEXT: s_mov_b32 s2, -1 ; GFX1232-NEXT: buffer_store_b8 v0, off, s[0:3], null @@ -7551,8 +7551,8 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS-NEXT: s_and_b32 s4, s4, 0xff ; GFX7LESS-NEXT: s_and_b32 s5, s10, 0xff ; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 ; GFX7LESS-NEXT: v_mad_u32_u24 v0, s5, v4, v0 @@ -7608,8 +7608,8 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX8-NEXT: .LBB13_4: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8-NEXT: s_and_b32 s4, s4, 0xff ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -7666,8 +7666,8 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX9-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX9-NEXT: .LBB13_4: ; %Flow ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -7725,10 +7725,10 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX1064-NEXT: .LBB13_4: ; %Flow ; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1064-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; 
GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: s_and_b32 s2, s2, 0xff ; GFX1064-NEXT: v_mad_u16 v0, s10, v4, s2 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -7782,10 +7782,10 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1032-NEXT: .LBB13_4: ; %Flow ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1032-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: s_and_b32 s2, s2, 0xff ; GFX1032-NEXT: v_mad_u16 v0, s8, v4, s2 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -7845,12 +7845,12 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1164-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX1164-NEXT: .LBB13_4: ; %Flow ; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_and_b32 s2, s2, 0xff +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mad_u16 v0, s10, v4, s2 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b8 v0, off, s[0:3], 0 @@ -7907,12 +7907,12 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1132-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1132-NEXT: .LBB13_4: ; %Flow ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1132-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_and_b32 s2, s2, 0xff +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_mad_u16 v0, s8, v4, s2 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b8 v0, off, s[0:3], 0 @@ -7972,13 +7972,12 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1264-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX1264-NEXT: .LBB13_4: ; %Flow ; GFX1264-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1264-NEXT: s_wait_alu 0xf1ff -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264-NEXT: s_and_b32 s2, s2, 0xff +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: v_mad_u16 v0, s10, v4, s2 ; GFX1264-NEXT: s_mov_b32 s2, -1 ; GFX1264-NEXT: buffer_store_b8 v0, off, s[0:3], null @@ -8038,13 +8037,12 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1232-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1232-NEXT: .LBB13_4: ; %Flow ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1232-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1232-NEXT: s_wait_alu 0xf1ff -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232-NEXT: s_and_b32 s2, s2, 0xff +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: v_mad_u16 v0, s8, v4, s2 ; GFX1232-NEXT: s_mov_b32 s2, -1 ; GFX1232-NEXT: buffer_store_b8 v0, off, s[0:3], null @@ -8456,10 +8454,10 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 -; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: s_and_b32 s4, s4, 0xffff +; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX7LESS-NEXT: v_or_b32_e32 v0, s4, v0 ; GFX7LESS-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm @@ -8490,10 +8488,10 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX8-NEXT: .LBB15_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -8527,10 +8525,10 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX9-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX9-NEXT: .LBB15_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 ; 
GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -8565,11 +8563,11 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s7, v0 ; GFX1064-NEXT: .LBB15_2: ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX1064-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -8601,11 +8599,11 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s5, v0 ; GFX1032-NEXT: .LBB15_2: ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc_lo +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX1032-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -8639,13 +8637,13 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX1164-NEXT: v_lshrrev_b32_e32 v0, s7, v0 ; GFX1164-NEXT: .LBB15_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: 
v_cndmask_b32_e64 v0, s6, 0, vcc -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -8678,13 +8676,13 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX1132-NEXT: v_lshrrev_b32_e32 v0, s6, v0 ; GFX1132-NEXT: .LBB15_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_cndmask_b32_e64 v0, s4, 0, vcc_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -8718,13 +8716,13 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX1264-NEXT: v_lshrrev_b32_e32 v0, s7, v0 ; GFX1264-NEXT: .LBB15_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264-NEXT: 
s_mov_b32 s3, 0x31016000 +; GFX1264-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1264-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX1264-NEXT: s_mov_b32 s2, -1 ; GFX1264-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -8757,13 +8755,13 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX1232-NEXT: v_lshrrev_b32_e32 v0, s6, v0 ; GFX1232-NEXT: .LBB15_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1232-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: v_cndmask_b32_e64 v0, s4, 0, vcc_lo -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1232-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX1232-NEXT: s_mov_b32 s2, -1 ; GFX1232-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -8827,8 +8825,8 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS-NEXT: s_and_b32 s4, s4, 0xffff ; GFX7LESS-NEXT: s_and_b32 s5, s10, 0xffff ; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 ; GFX7LESS-NEXT: v_mad_u32_u24 v0, s5, v4, v0 @@ -8884,8 +8882,8 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX8-NEXT: .LBB16_4: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 +; 
GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -8942,8 +8940,8 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX9-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX9-NEXT: .LBB16_4: ; %Flow ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -9001,10 +8999,10 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX1064-NEXT: .LBB16_4: ; %Flow ; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1064-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX1064-NEXT: v_mad_u16 v0, s10, v4, s2 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -9058,10 +9056,10 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1032-NEXT: .LBB16_4: ; %Flow ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1032-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX1032-NEXT: v_mad_u16 v0, s8, v4, s2 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -9121,12 +9119,12 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1164-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX1164-NEXT: .LBB16_4: ; %Flow ; 
GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mad_u16 v0, s10, v4, s2 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -9183,12 +9181,12 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1132-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1132-NEXT: .LBB16_4: ; %Flow ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_mad_u16 v0, s8, v4, s2 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -9248,13 +9246,12 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1264-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX1264-NEXT: .LBB16_4: ; %Flow ; GFX1264-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; 
GFX1264-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1264-NEXT: s_wait_alu 0xf1ff -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: v_mad_u16 v0, s10, v4, s2 ; GFX1264-NEXT: s_mov_b32 s2, -1 ; GFX1264-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -9314,13 +9311,12 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1232-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1232-NEXT: .LBB16_4: ; %Flow ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1232-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1232-NEXT: s_wait_alu 0xf1ff -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: v_mad_u16 v0, s8, v4, s2 ; GFX1232-NEXT: s_mov_b32 s2, -1 ; GFX1232-NEXT: buffer_store_b16 v0, off, s[0:3], null diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index aafdb1c8cc36f..50372d2095ae5 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -18718,15 +18718,15 @@ define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) { ; GCN-LABEL: s_fneg_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: v_mul_f32_e64 v0, -1.0, s0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_fneg_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: 
v_mul_f32_e64 v0, -1.0, s0 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: s_lshr_b32 s0, s0, 16 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fneg_bf16: @@ -30619,18 +30619,18 @@ define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign ; GCN-NEXT: s_and_b32 s0, s1, 0x80000000 ; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_or_b32_e32 v0, s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s1, v0 +; GCN-NEXT: s_or_b32 s0, s1, s0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_bf16_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 ; GFX7-NEXT: s_and_b32 s0, s1, 0x80000000 -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: s_lshr_b32 s0, s0, 16 +; GFX7-NEXT: v_readfirstlane_b32 s1, v0 +; GFX7-NEXT: s_or_b32 s0, s1, s0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_bf16_bf16: @@ -30639,8 +30639,8 @@ define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_copysign_bf16_bf16: @@ -30649,16 +30649,16 @@ define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_copysign_bf16_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v0, s1 
; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_copysign_bf16_bf16: @@ -30666,9 +30666,9 @@ define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign ; GFX11-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX11-NEXT: ; return to shader part epilog %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) %cast = bitcast bfloat %op to i16 @@ -30684,18 +30684,18 @@ define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f ; GCN-NEXT: s_and_b32 s0, s1, 0x80000000 ; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_or_b32_e32 v0, s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s1, v0 +; GCN-NEXT: s_or_b32 s0, s1, s0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_bf16_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 ; GFX7-NEXT: s_and_b32 s0, s1, 0x80000000 -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: s_lshr_b32 s0, s0, 16 +; GFX7-NEXT: v_readfirstlane_b32 s1, v0 +; GFX7-NEXT: s_or_b32 s0, s1, s0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_bf16_f32: @@ -30704,8 +30704,8 @@ define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f ; GFX8-NEXT: s_movk_i32 s1, 0x7fff ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0 -; 
GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_copysign_bf16_f32: @@ -30714,16 +30714,16 @@ define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f ; GFX9-NEXT: s_movk_i32 s1, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_copysign_bf16_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 16, s1 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_copysign_bf16_f32: @@ -30731,9 +30731,9 @@ define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f ; GFX11-NEXT: v_lshrrev_b32_e64 v0, 16, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX11-NEXT: ; return to shader part epilog %sign = fptrunc float %sign.f32 to bfloat %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) @@ -30750,18 +30750,18 @@ define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign. 
; GCN-NEXT: s_and_b32 s0, s2, 0x80000000 ; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_or_b32_e32 v0, s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s1, v0 +; GCN-NEXT: s_or_b32 s0, s1, s0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_bf16_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 ; GFX7-NEXT: s_and_b32 s0, s2, 0x80000000 -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: s_lshr_b32 s0, s0, 16 +; GFX7-NEXT: v_readfirstlane_b32 s1, v0 +; GFX7-NEXT: s_or_b32 s0, s1, s0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_bf16_f64: @@ -30770,8 +30770,8 @@ define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign. ; GFX8-NEXT: s_movk_i32 s1, 0x7fff ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_copysign_bf16_f64: @@ -30780,16 +30780,16 @@ define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign. 
; GFX9-NEXT: s_movk_i32 s1, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_copysign_bf16_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 16, s2 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_copysign_bf16_f64: @@ -30797,9 +30797,9 @@ define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign. ; GFX11-NEXT: v_lshrrev_b32_e64 v0, 16, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX11-NEXT: ; return to shader part epilog %sign = fptrunc double %sign.f64 to bfloat %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) @@ -30836,8 +30836,8 @@ define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f1 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_copysign_bf16_f16: @@ -30846,16 +30846,16 @@ define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: 
s_and_b32 s0, 0xffff, s0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_copysign_bf16_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_copysign_bf16_f16: @@ -30863,9 +30863,9 @@ define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f1 ; GFX11-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX11-NEXT: ; return to shader part epilog %sign = bitcast half %sign.f16 to bfloat %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) @@ -31075,8 +31075,8 @@ define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_copysign_f16_bf16: @@ -31085,16 +31085,16 @@ define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_copysign_f16_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX10-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_copysign_f16_bf16: @@ -31102,9 +31102,9 @@ define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf ; GFX11-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX11-NEXT: ; return to shader part epilog %sign = bitcast bfloat %sign.bf16 to half %op = call half @llvm.copysign.f16(half %mag, half %sign) @@ -38110,8 +38110,8 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { ; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_select_bf16: @@ -38120,8 +38120,8 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: s_lshr_b32 s0, s0, 16 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_select_bf16: @@ -38130,8 +38130,8 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: 
s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_select_bf16: @@ -38140,8 +38140,8 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_select_bf16: @@ -38149,8 +38149,8 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { ; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s1, v1, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11TRUE16-LABEL: s_select_bf16: @@ -38159,9 +38159,9 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v0.l, vcc_lo -; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX11TRUE16-NEXT: ; return to shader part epilog ; ; GFX11FAKE16-LABEL: s_select_bf16: @@ -38170,9 +38170,9 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s1, v1, vcc_lo -; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX11FAKE16-NEXT: ; return to shader part epilog %cond = icmp eq i32 %c, 0 %op = select i1 %cond, bfloat %a, bfloat %b @@ -39793,9 +39793,9 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v1 +; GFX8-NEXT: s_and_b32 s1, s0, 0xffff ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_select_v3bf16: @@ -39807,9 +39807,9 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: s_and_b32 s1, s0, 0xffff ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_select_v3bf16: @@ -39820,8 +39820,8 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> ; GFX10-NEXT: v_cndmask_b32_e32 v0, s2, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, s3, v2, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_select_v3bf16: @@ -39833,9 +39833,9 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> ; GFX11-NEXT: v_cndmask_b32_e32 v1, s3, v2, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: 
v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff ; GFX11-NEXT: ; return to shader part epilog %cond = icmp eq i32 %c, 0 %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index 2a372dffce650..1b06c098fa5e0 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -1317,10 +1317,10 @@ define amdgpu_ps i32 @s_fmaximum3_f16(half inreg %a, half inreg %b, half inreg % ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_maximum3_f16 v0, s0, s1, v0 -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_readfirstlane_b32 s0, v0 -; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: ; return to shader part epilog ; ; GFX942-LABEL: s_fmaximum3_f16: @@ -1335,9 +1335,9 @@ define amdgpu_ps i32 @s_fmaximum3_f16(half inreg %a, half inreg %b, half inreg % ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_readfirstlane_b32 s0, v0 +; GFX942-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX942-NEXT: ; return to shader part epilog ; ; GFX950-LABEL: s_fmaximum3_f16: @@ -1346,9 +1346,8 @@ define amdgpu_ps i32 @s_fmaximum3_f16(half inreg %a, half inreg %b, half inreg % ; GFX950-NEXT: v_mov_b32_e32 v1, s2 ; GFX950-NEXT: v_pk_maximum3_f16 v0, s0, v0, v1 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_readfirstlane_b32 s0, v0 
+; GFX950-NEXT: s_and_b32 s0, s0, 0xffff ; GFX950-NEXT: ; return to shader part epilog %max0 = call half @llvm.maximum.f16(half %a, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half %c) @@ -3764,12 +3763,12 @@ define amdgpu_ps <2 x i32> @s_no_fmaximum3_f16__multi_use(half inreg %a, half in ; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX942-NEXT: v_max_f16_e32 v1, s2, v0 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 -; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX942-NEXT: v_readfirstlane_b32 s0, v0 +; GFX942-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_readfirstlane_b32 s1, v1 +; GFX942-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX942-NEXT: ; return to shader part epilog ; ; GFX950-LABEL: s_no_fmaximum3_f16__multi_use: @@ -3778,10 +3777,10 @@ define amdgpu_ps <2 x i32> @s_no_fmaximum3_f16__multi_use(half inreg %a, half in ; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s1, s1 ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_pk_maximum3_f16 v1, v0, s2, s2 -; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX950-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX950-NEXT: v_readfirstlane_b32 s0, v0 ; GFX950-NEXT: v_readfirstlane_b32 s1, v1 +; GFX950-NEXT: s_and_b32 s0, s0, 0xffff +; GFX950-NEXT: s_and_b32 s1, s1, 0xffff ; GFX950-NEXT: ; return to shader part epilog %max0 = call half @llvm.maximum.f16(half %a, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half %c) diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll index 34d7e5acb7896..62de5094ebec5 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -1317,10 +1317,10 @@ define amdgpu_ps i32 @s_fminimum3_f16(half inreg %a, half inreg %b, half inreg % ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX12-NEXT: v_minimum3_f16 v0, s0, s1, v0 -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_readfirstlane_b32 s0, v0 -; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: ; return to shader part epilog ; ; GFX942-LABEL: s_fminimum3_f16: @@ -1335,9 +1335,9 @@ define amdgpu_ps i32 @s_fminimum3_f16(half inreg %a, half inreg %b, half inreg % ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_readfirstlane_b32 s0, v0 +; GFX942-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX942-NEXT: ; return to shader part epilog ; ; GFX950-LABEL: s_fminimum3_f16: @@ -1346,9 +1346,8 @@ define amdgpu_ps i32 @s_fminimum3_f16(half inreg %a, half inreg %b, half inreg % ; GFX950-NEXT: v_mov_b32_e32 v1, s2 ; GFX950-NEXT: v_pk_minimum3_f16 v0, s0, v0, v1 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_readfirstlane_b32 s0, v0 +; GFX950-NEXT: s_and_b32 s0, s0, 0xffff ; GFX950-NEXT: ; return to shader part epilog %max0 = call half @llvm.minimum.f16(half %a, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half %c) @@ -3764,12 +3763,12 @@ define amdgpu_ps <2 x i32> @s_no_fminimum3_f16__multi_use(half inreg %a, half in ; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX942-NEXT: v_min_f16_e32 v1, s2, v0 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 -; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX942-NEXT: v_readfirstlane_b32 s0, v0 +; GFX942-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_readfirstlane_b32 s1, v1 
+; GFX942-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX942-NEXT: ; return to shader part epilog ; ; GFX950-LABEL: s_no_fminimum3_f16__multi_use: @@ -3778,10 +3777,10 @@ define amdgpu_ps <2 x i32> @s_no_fminimum3_f16__multi_use(half inreg %a, half in ; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s1, s1 ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_pk_minimum3_f16 v1, v0, s2, s2 -; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX950-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX950-NEXT: v_readfirstlane_b32 s0, v0 ; GFX950-NEXT: v_readfirstlane_b32 s1, v1 +; GFX950-NEXT: s_and_b32 s0, s0, 0xffff +; GFX950-NEXT: s_and_b32 s1, s1, 0xffff ; GFX950-NEXT: ; return to shader part epilog %max0 = call half @llvm.minimum.f16(half %a, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half %c) diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll index 8c91acd5ae024..8dd0e0e9be9f2 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll @@ -3271,8 +3271,8 @@ define amdgpu_ps i32 @s_mul_32_f16(half inreg %x, half inreg %y) { ; GFX1011-LABEL: s_mul_32_f16: ; GFX1011: ; %bb.0: ; GFX1011-NEXT: v_mul_f16_e64 v0, 0x5000, s0 -; GFX1011-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1011-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1011-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX1011-NEXT: ; return to shader part epilog %mul = fmul contract half %x, 32.0 %cast = bitcast half %mul to i16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll index ad5e9f4eb6a63..f03ea81e8981a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll @@ -98,7 +98,9 @@ define amdgpu_kernel void @gws_barrier_sgpr_offset_add1(i32 %val, i32 %offset.ba ; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16 ; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}} -; NOLOOP-GISEL-DAG: s_lshl_b32 m0, [[READLANE]], 16 + +; 
FIXME: check exact reg, GFX11 has an added s_and +; NOLOOP-GISEL-DAG: s_lshl_b32 m0, {{s[0-9]+}}, 16 ; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], [[BAR_NUM]] ; NOLOOP: ds_gws_barrier [[GWS_VAL]] gds{{$}} @@ -116,7 +118,8 @@ define amdgpu_kernel void @gws_barrier_vgpr_offset(i32 %val) #0 { ; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16 ; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}} -; NOLOOP-GISEL-DAG: s_lshl_b32 m0, [[READLANE]], 16 +; FIXME: check exact reg, GFX11 has an added s_and +; NOLOOP-GISEL-DAG: s_lshl_b32 m0, {{s[0-9]+}}, 16 ; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], [[BAR_NUM]] ; NOLOOP: ds_gws_barrier [[GWS_VAL]] offset:3 gds{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll index f658ab39f771f..48db6787cc2f1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll @@ -91,7 +91,8 @@ define amdgpu_kernel void @gws_init_sgpr_offset_add1(i32 %val, i32 %offset.base) ; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16 ; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}} -; NOLOOP-GISEL-DAG: s_lshl_b32 m0, [[READLANE]], 16 +; FIXME: check exact reg, GFX11 has an added s_and +; NOLOOP-GISEL-DAG: s_lshl_b32 m0, {{s[0-9]+}}, 16 ; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]] ; NOLOOP: ds_gws_init v0 gds{{$}} @@ -109,7 +110,8 @@ define amdgpu_kernel void @gws_init_vgpr_offset(i32 %val) #0 { ; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16 ; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}} -; NOLOOP-GISEL-DAG: s_lshl_b32 m0, [[READLANE]], 16 +; FIXME: check exact reg, GFX11 has an added s_and +; NOLOOP-GISEL-DAG: s_lshl_b32 m0, {{s[0-9]+}}, 16 ; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]] ; NOLOOP: ds_gws_init v0 offset:3 gds{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll index 840287b10bb49..19d3a80e368ba 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll @@ -134,20 +134,20 @@ define amdgpu_kernel void @id_arg_i32(i32 %row) #0 { define amdgpu_kernel void @id_row_i32() #0 { ; GFX11-LABEL: id_row_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x63 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, s0, 0x3ff ; GFX11-NEXT: s_mov_b32 m0, s0 ; GFX11-NEXT: exp pos0 v0, off, off, off done row_en ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: id_row_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_readfirstlane_b32 s0, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0x63 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, s0, 0x3ff ; GFX12-NEXT: s_mov_b32 m0, s0 ; GFX12-NEXT: export pos0 v0, off, off, off done row_en ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index 076cf09678b57..940d5c7c2ed33 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -799,17 +799,16 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 10, 10 +; 
GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: s_and_b32 s3, s3, 0x3ff ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s3, s2 +; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vvv_i32: @@ -817,49 +816,48 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-GISEL-NEXT: s_and_b32 s2, s3, 0x3ff +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s4 ; 
GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvv_i32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-SDAG-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_and_b32 s3, s3, 0x3ff ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s3, s2 +; GFX12-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvv_i32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: s_wait_alu 
0xf1ff -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: s_and_b32 s2, s3, 0x3ff +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() @@ -900,41 +898,77 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 % ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vvv_i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-NEXT: v_permlane16_b32 v0, v0, s4, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlane16_b32 v1, v1, s4, s5 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: v_permlane16_b32_vvv_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX11-SDAG-NEXT: s_and_b32 s2, s4, 0x3ff +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; 
GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s5 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vvv_i64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s5, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-NEXT: v_permlane16_b32 v0, v0, s4, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlane16_b32 v1, v1, s4, s5 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_endpgm +; GFX11-GISEL-LABEL: v_permlane16_b32_vvv_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_and_b32 s2, s4, 0x3ff +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s5 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vvv_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX12-SDAG-NEXT: 
v_readfirstlane_b32 s4, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX12-SDAG-NEXT: s_and_b32 s2, s4, 0x3ff +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s5 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s5 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vvv_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_and_b32 s2, s4, 0x3ff +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s5 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s5 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) @@ -977,17 +1011,16 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 
10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: s_and_b32 s3, s3, 0x3ff ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s3, s2 +; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vvv_f32: @@ -995,49 +1028,48 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 
v0, s2 +; GFX11-GISEL-NEXT: s_and_b32 s2, s3, 0x3ff +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvv_f32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-SDAG-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_and_b32 s3, s3, 0x3ff ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s3, s2 +; GFX12-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvv_f32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12-GISEL-NEXT: 
s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: s_and_b32 s2, s3, 0x3ff +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() @@ -1078,41 +1110,77 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, doubl ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vvv_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-NEXT: v_permlane16_b32 v0, v0, s4, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlane16_b32 v1, v1, s4, s5 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: v_permlane16_b32_vvv_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX11-SDAG-NEXT: s_and_b32 s2, s4, 0x3ff +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s5 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vvv_f64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s5, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-NEXT: v_permlane16_b32 v0, v0, s4, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlane16_b32 v1, v1, s4, s5 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_endpgm +; GFX11-GISEL-LABEL: v_permlane16_b32_vvv_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_and_b32 s2, s4, 0x3ff +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s5 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm +; +; 
GFX12-SDAG-LABEL: v_permlane16_b32_vvv_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX12-SDAG-NEXT: s_and_b32 s2, s4, 0x3ff +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s5 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s5 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vvv_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_and_b32 s2, s4, 0x3ff +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s5 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s5 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) @@ -1144,54 +1212,31 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; GFX10-GISEL-NEXT: 
global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_i32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_i32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_i32: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_endpgm +; GFX11-LABEL: v_permlane16_b32_vvs_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: 
v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_and_b32 s2, s4, 0x3ff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_i32: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_endpgm +; GFX12-LABEL: v_permlane16_b32_vvs_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_and_b32 s2, s4, 0x3ff +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -1231,117 +1276,132 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i64(ptr addrspace(1) %out, i64 % ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vvs_i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_permlane16_b32 v1, v1, s5, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s5, s4 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlane16_b32_vvs_i64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s5, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: v_permlane16_b32 v1, v1, s5, s4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s5, s4 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float %src0, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_f32: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_f32: -; GFX10-GISEL: ; %bb.0: -; 
GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-GISEL-NEXT: s_mov_b32 null, 0 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_f32: +; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_i64: ; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-NEXT: s_and_b32 s2, s5, 0x3ff +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s4 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_f32: +; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_i64: ; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 
-; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-GISEL-NEXT: s_and_b32 s2, s5, 0x3ff +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s4 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_f32: +; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_i64: ; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_and_b32 s2, s5, 0x3ff +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: 
v_permlane16_b32 v0, v0, s2, s4 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_f32: +; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_i64: ; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_and_b32 s2, s5, 0x3ff +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s4 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s4 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float %src0, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_f32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: v_permlane16_b32 
v1, v1, s2, s3 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_f32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_vvs_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_and_b32 s2, s4, 0x3ff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_vvs_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_and_b32 s2, s4, 0x3ff +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store float %v, ptr addrspace(1) %out @@ -1381,39 +1441,77 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f64(ptr addrspace(1) %out, doubl ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; 
GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vvs_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_permlane16_b32 v1, v1, s5, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s5, s4 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-NEXT: s_and_b32 s2, s5, 0x3ff +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s4 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vvs_f64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s5, v0 -; 
GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: v_permlane16_b32 v1, v1, s5, s4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s5, s4 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_endpgm +; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-GISEL-NEXT: s_and_b32 s2, s5, 0x3ff +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s4 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_and_b32 s2, s5, 0x3ff +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s4 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 
0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_and_b32 s2, s5, 0x3ff +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s4 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s4 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store double %v, ptr addrspace(1) %out @@ -3364,17 +3462,16 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: s_and_b32 s3, s3, 0x3ff ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s3, s2 +; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv_i32: @@ -3382,49 +3479,48 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-GISEL-NEXT: s_and_b32 s2, s3, 0x3ff +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv_i32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-SDAG-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v0 +; GFX12-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_and_b32 s3, s3, 0x3ff ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s3, s2 +; GFX12-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv_i32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: s_and_b32 s2, s3, 0x3ff +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() @@ -3469,17 +3565,16 @@ define amdgpu_kernel 
void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: s_and_b32 s3, s3, 0x3ff ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s3, s2 +; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv_f32: @@ -3487,49 +3582,48 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v1 
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-GISEL-NEXT: s_and_b32 s2, s3, 0x3ff +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv_f32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-SDAG-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_and_b32 s3, s3, 0x3ff ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s3, s2 +; GFX12-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv_f32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: s_and_b32 s2, s3, 0x3ff +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() @@ -3570,63 +3664,99 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlanex16_b32_vvv_i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s4, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlanex16_b32 v1, v1, s4, s5 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_vvv_i64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 
s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s5, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s4, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlanex16_b32 v1, v1, s4, s5 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, double %src0) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vvv_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-SDAG-NEXT: s_mov_b32 null, 0 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX10-SDAG-NEXT: s_endpgm +; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX11-SDAG-NEXT: s_and_b32 s2, s4, 0x3ff +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s5 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_and_b32 s2, s4, 0x3ff +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s5 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX12-SDAG-NEXT: s_and_b32 s2, s4, 0x3ff +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s5 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: 
v_permlanex16_b32 v0, v0, s2, s5 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_and_b32 s2, s4, 0x3ff +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s5 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s5 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, double %src0) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vvv_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvv_f64: ; GFX10-GISEL: 
; %bb.0: @@ -3643,41 +3773,77 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, doub ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlanex16_b32_vvv_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s4, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlanex16_b32 v1, v1, s4, s5 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX11-SDAG-NEXT: s_and_b32 s2, s4, 0x3ff +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s5 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlanex16_b32_vvv_f64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s5, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s4, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlanex16_b32 v1, v1, s4, s5 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_endpgm +; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_and_b32 s2, s4, 0x3ff +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s5 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX12-SDAG-NEXT: s_and_b32 s2, s4, 0x3ff +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s5 +; GFX12-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s5 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: v_bfe_u32 v1, v0, 10, 10 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_and_b32 s2, s4, 0x3ff +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s5 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s5 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) @@ -3709,54 +3875,31 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_i32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: 
v_permlanex16_b32_vvs_i32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_i32: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_endpgm +; GFX11-LABEL: v_permlanex16_b32_vvs_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_and_b32 s2, s4, 0x3ff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_i32: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 
s4, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_endpgm +; GFX12-LABEL: v_permlanex16_b32_vvs_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_and_b32 s2, s4, 0x3ff +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -3787,54 +3930,31 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_f32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_f32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; 
GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_f32: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_endpgm +; GFX11-LABEL: v_permlanex16_b32_vvs_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_and_b32 s2, s4, 0x3ff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_f32: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_endpgm +; GFX12-LABEL: v_permlanex16_b32_vvs_f32: +; GFX12: ; 
%bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_and_b32 s2, s4, 0x3ff +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store float %v, ptr addrspace(1) %out @@ -3874,39 +3994,77 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i64(ptr addrspace(1) %out, i64 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlanex16_b32_vvs_i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_permlanex16_b32 v1, v1, s5, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s5, s4 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_3) +; GFX11-SDAG-NEXT: s_and_b32 s2, s5, 0x3ff +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s4 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlanex16_b32_vvs_i64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s5, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: v_permlanex16_b32 v1, v1, s5, s4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s5, s4 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_endpgm +; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-GISEL-NEXT: s_and_b32 s2, s5, 0x3ff +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s4 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_i64: +; GFX12-SDAG: ; %bb.0: +; 
GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_and_b32 s2, s5, 0x3ff +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s4 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_and_b32 s2, s5, 0x3ff +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s4 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s4 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out @@ -3946,39 +4104,77 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f64(ptr addrspace(1) %out, doub ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: 
v_permlanex16_b32_vvs_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_permlanex16_b32 v1, v1, s5, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s5, s4 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-NEXT: s_and_b32 s2, s5, 0x3ff +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s4 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlanex16_b32_vvs_f64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s5, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; 
GFX12-NEXT: v_permlanex16_b32 v1, v1, s5, s4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s5, s4 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_endpgm +; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-GISEL-NEXT: s_and_b32 s2, s5, 0x3ff +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s4 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_and_b32 s2, s5, 0x3ff +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s4 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: 
s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_and_b32 s2, s5, 0x3ff +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s4 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s4 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store double %v, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll index 3d9ce6e79d9d2..ab70b1506505b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll @@ -114,8 +114,8 @@ define amdgpu_gs i32 @s_fptrunc_round_f32_to_f16_upward(float inreg %a, ptr addr ; CHECK-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1 ; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: s_and_b32 s0, 0xffff, s0 ; CHECK-NEXT: ; return to shader part epilog %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.upward") %bitcast = bitcast half %res to i16 @@ -129,8 +129,8 @@ define amdgpu_gs i32 @s_fptrunc_round_f32_to_f16_downward(float inreg %a, ptr ad ; CHECK-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1 ; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; CHECK-NEXT: v_readfirstlane_b32 
s0, v0 +; CHECK-NEXT: s_and_b32 s0, 0xffff, s0 ; CHECK-NEXT: ; return to shader part epilog %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.downward") %bitcast = bitcast half %res to i16 @@ -257,10 +257,10 @@ define amdgpu_gs <2 x i32> @s_fptrunc_round_v2f32_to_v2f16_upward(<2 x float> in ; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1 ; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 ; CHECK-NEXT: v_readfirstlane_b32 s1, v1 +; CHECK-NEXT: s_and_b32 s0, 0xffff, s0 +; CHECK-NEXT: s_and_b32 s1, 0xffff, s1 ; CHECK-NEXT: ; return to shader part epilog %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward") %bitcast = bitcast <2 x half> %res to <2 x i16> @@ -276,10 +276,10 @@ define amdgpu_gs <2 x i32> @s_fptrunc_round_v2f32_to_v2f16_downward(<2 x float> ; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1 ; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 ; CHECK-NEXT: v_readfirstlane_b32 s1, v1 +; CHECK-NEXT: s_and_b32 s0, 0xffff, s0 +; CHECK-NEXT: s_and_b32 s1, 0xffff, s1 ; CHECK-NEXT: ; return to shader part epilog %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.downward") %bitcast = bitcast <2 x half> %res to <2 x i16> diff --git a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll index 0c8dbe865a872..59930801701fd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll @@ -1193,9 +1193,8 @@ define amdgpu_gfx void @s_set_rounding_select_0_1(i32 inreg %cond) { ; GFX8-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 
0, 1, s[34:35] ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX8-NEXT: s_mov_b32 s34, 0xa50f -; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s34 ; GFX8-NEXT: v_readfirstlane_b32 s34, v0 +; GFX8-NEXT: s_lshr_b32 s34, 0xa50f, s34 ; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1206,9 +1205,8 @@ define amdgpu_gfx void @s_set_rounding_select_0_1(i32 inreg %cond) { ; GFX9-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s34, 0xa50f -; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s34 ; GFX9-NEXT: v_readfirstlane_b32 s34, v0 +; GFX9-NEXT: s_lshr_b32 s34, 0xa50f, s34 ; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1219,8 +1217,8 @@ define amdgpu_gfx void @s_set_rounding_select_0_1(i32 inreg %cond) { ; GFX10-NEXT: s_cselect_b32 s34, -1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, 0xa50f ; GFX10-NEXT: v_readfirstlane_b32 s34, v0 +; GFX10-NEXT: s_lshr_b32 s34, 0xa50f, s34 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1231,8 +1229,8 @@ define amdgpu_gfx void @s_set_rounding_select_0_1(i32 inreg %cond) { ; GFX11-NEXT: s_cselect_b32 s0, -1, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, 0xa50f ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_lshr_b32 s0, 0xa50f, s0 ; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %cond, 0