diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 8e35109061792..98128e878cd2a 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2718,6 +2718,17 @@ def int_amdgcn_call_whole_wave:
              llvm_vararg_ty], // The arguments to the callee.
             [IntrConvergent]>;
 
+//
+// llvm.amdgcn.wave.shuffle
+// The value and result can be any 32-bit floating-point or
+// integer type, and must be the same type. An index value
+// outside the valid lane range wraps around, and reading
+// from an inactive lane returns poison.
+def int_amdgcn_wave_shuffle :
+  DefaultAttrsIntrinsic<[llvm_any_ty], // return types
+    [LLVMMatchType<0>, llvm_i32_ty], // arg types
+    [IntrConvergent, IntrNoMem]>; // flags
+
 //===----------------------------------------------------------------------===//
 // CI+ Intrinsics
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 650df2a87506a..d6a59823526b8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1216,6 +1216,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
   case Intrinsic::amdgcn_permlane16_swap:
   case Intrinsic::amdgcn_permlane32_swap:
     return selectPermlaneSwapIntrin(I, IntrinsicID);
+  case Intrinsic::amdgcn_wave_shuffle:
+    return selectWaveShuffleIntrin(I);
   default:
     return selectImpl(I, *CoverageInfo);
   }
@@ -3852,6 +3854,129 @@ bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
   return true;
 }
 
+bool AMDGPUInstructionSelector::selectWaveShuffleIntrin(
+    MachineInstr &MI) const {
+  assert(MI.getNumOperands() == 4);
+  MachineBasicBlock *MBB = MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register ValReg = MI.getOperand(2).getReg();
+  Register IdxReg = MI.getOperand(3).getReg();
+
+  const LLT DstTy = MRI->getType(DstReg);
+  unsigned DstSize = DstTy.getSizeInBits();
+  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+  const TargetRegisterClass *DstRC =
+      TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
+
+  assert(DstTy == LLT::scalar(32));
+
+  // If we can bpermute across the whole wave, then just do that
+  if (Subtarget->supportsWaveWideBPermute()) {
+    Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
+        .addImm(2)
+        .addReg(IdxReg);
+
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), DstReg)
+        .addReg(ShiftIdxReg)
+        .addReg(ValReg)
+        .addImm(0);
+  } else {
+    // Otherwise, we need to make use of whole wave mode
+    assert(Subtarget->isWave64());
+
+    // Set inactive lanes to poison
+    Register UndefValReg =
+        MRI->createVirtualRegister(TRI.getRegClass(AMDGPU::SReg_32RegClassID));
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefValReg);
+
+    Register UndefExecReg = MRI->createVirtualRegister(
+        TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefExecReg);
+
+    Register PoisonValReg = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonValReg)
+        .addImm(0)
+        .addReg(ValReg)
+        .addImm(0)
+        .addReg(UndefValReg)
+        .addReg(UndefExecReg);
+
+    // ds_bpermute requires index to be multiplied by 4
+    Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
+        .addImm(2)
+        .addReg(IdxReg);
+
+    Register PoisonIdxReg = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonIdxReg)
+        .addImm(0)
+        .addReg(ShiftIdxReg)
+        .addImm(0)
+        .addReg(UndefValReg)
+        .addReg(UndefExecReg);
+
+    // Get permutation of each half, then we'll select which one to use
+    Register SameSidePermReg = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), SameSidePermReg)
+        .addReg(PoisonIdxReg)
+        .addReg(PoisonValReg)
+        .addImm(0);
+
+    Register SwappedValReg = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_PERMLANE64_B32), SwappedValReg)
+        .addReg(PoisonValReg);
+
+    Register OppSidePermReg = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), OppSidePermReg)
+        .addReg(PoisonIdxReg)
+        .addReg(SwappedValReg)
+        .addImm(0);
+
+    Register WWMSwapPermReg = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::STRICT_WWM), WWMSwapPermReg)
+        .addReg(OppSidePermReg);
+
+    // Select which side to take the permute from
+    // We can get away with only using mbcnt_lo here since we're only
+    // trying to detect which side of 32 each lane is on, and mbcnt_lo
+    // returns 32 for lanes 32-63.
+    Register ThreadIDReg = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MBCNT_LO_U32_B32_e64), ThreadIDReg)
+        .addImm(-1)
+        .addImm(0);
+
+    Register XORReg = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_XOR_B32_e64), XORReg)
+        .addReg(ThreadIDReg)
+        .addReg(PoisonIdxReg);
+
+    Register ANDReg = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e64), ANDReg)
+        .addReg(XORReg)
+        .addImm(32);
+
+    Register CompareReg = MRI->createVirtualRegister(
+        TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), CompareReg)
+        .addReg(ANDReg)
+        .addImm(0);
+
+    // Finally do the selection
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+        .addImm(0)
+        .addReg(WWMSwapPermReg)
+        .addImm(0)
+        .addReg(SameSidePermReg)
+        .addReg(CompareReg);
+  }
+
+  MI.eraseFromParent();
+  return true;
+}
+
 // Match BITOP3 operation and return a number of matched instructions plus
 // truth table.
 static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index c760fe7ef99dd..627cce277ae38 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -156,6 +156,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const;
   bool selectSGetBarrierState(MachineInstr &I, Intrinsic::ID IID) const;
   bool selectSBarrierLeave(MachineInstr &I) const;
+  bool selectWaveShuffleIntrin(MachineInstr &I) const;
 
   std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src,
                                                    bool IsCanonicalizing = true,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 7ed026ee5f69e..7d838c58d607d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5230,11 +5230,20 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
       break;
     }
-    case Intrinsic::amdgcn_s_bitreplicate:
+    case Intrinsic::amdgcn_s_bitreplicate: {
       Register MaskReg = MI.getOperand(2).getReg();
       unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
       OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
+      break;
+    }
+    case Intrinsic::amdgcn_wave_shuffle: {
+      unsigned OpSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
+      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
+      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
+      break;
+    }
     }
     break;
   }
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index cb27f474d78f3..df98d473e16e2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1894,6 +1894,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool requiresWaitsBeforeSystemScopeStores() const {
     return RequiresWaitsBeforeSystemScopeStores;
   }
+
+  bool supportsWaveWideBPermute() const {
+    return getGeneration() == AMDGPUSubtarget::GFX12 || isWave32();
+  }
 };
 
 class GCNUserSGPRUsageInfo {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e37d739fc25df..6e35901d10c74 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7280,6 +7280,81 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
   return DAG.getBitcast(VT, UnrolledLaneOp);
 }
 
+static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N,
+                                SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  assert(VT.getSizeInBits() == 32);
+  SDLoc SL(N);
+
+  SDValue Value = N->getOperand(1);
+  SDValue Index = N->getOperand(2);
+
+  // ds_bpermute requires index to be multiplied by 4
+  SDValue ShiftAmount = DAG.getShiftAmountConstant(2, MVT::i32, SL);
+  SDValue ShiftedIndex = DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index,
+                                     ShiftAmount);
+
+  // Intrinsics will require i32 to operate on
+  SDValue ValueI32 = DAG.getBitcast(MVT::i32, Value);
+
+  auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
+                                   SmallVector<SDValue> IntrinArgs) -> SDValue {
+    SmallVector<SDValue> Operands(1);
+    Operands[0] = DAG.getTargetConstant(IID, SL, MVT::i32);
+    Operands.append(IntrinArgs);
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
+  };
+
+  // If we can bpermute across the whole wave, then just do that
+  if (TLI.getSubtarget()->supportsWaveWideBPermute()) {
+    SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
+                                     {ShiftedIndex, ValueI32});
+    return DAG.getBitcast(VT, BPermute);
+  }
+
+  assert(TLI.getSubtarget()->isWave64());
+
+  // Otherwise, we need to make use of whole wave mode
+  SDValue PoisonVal = DAG.getPOISON(ValueI32->getValueType(0));
+
+  // Set inactive lanes to poison
+  SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
+                                   {ValueI32, PoisonVal});
+  SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
+                                   {ShiftedIndex, PoisonVal});
+
+  SDValue Swapped =
+      MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
+
+  // Get permutation of each half, then we'll select which one to use
+  SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
+                                        MVT::i32, {WWMIndex, WWMValue});
+  SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
+                                         MVT::i32, {WWMIndex, Swapped});
+  SDValue BPermOtherHalfWWM =
+      MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
+
+  // Select which side to take the permute from
+  SDValue ThreadIDMask = DAG.getAllOnesConstant(SL, MVT::i32);
+  // We can get away with only using mbcnt_lo here since we're only
+  // trying to detect which side of 32 each lane is on, and mbcnt_lo
+  // returns 32 for lanes 32-63.
+  SDValue ThreadID =
+      MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
+                    {ThreadIDMask, DAG.getTargetConstant(0, SL, MVT::i32)});
+
+  SDValue SameOrOtherHalf =
+      DAG.getNode(ISD::AND, SL, MVT::i32,
+                  DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index),
+                  DAG.getTargetConstant(32, SL, MVT::i32));
+  SDValue UseSameHalf =
+      DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
+                   DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ);
+  SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf,
+                                 BPermOtherHalfWWM);
+  return DAG.getBitcast(VT, Result);
+}
+
 void SITargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
@@ -10187,6 +10262,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       Poisons.push_back(DAG.getPOISON(ValTy));
     return DAG.getMergeValues(Poisons, SDLoc(Op));
   }
+  case Intrinsic::amdgcn_wave_shuffle:
+    return lowerWaveShuffle(*this, Op.getNode(), DAG);
   default:
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.shuffle.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.shuffle.ll
new file mode 100644
index 0000000000000..96039dc11c70b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.shuffle.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-W32 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-W32 %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11-W64 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-W64 %s
+
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-W32-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-W32-GISEL %s
+
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11-W64-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-W64-GISEL %s
+
+declare float @llvm.amdgcn.wave.shuffle.float(float, i32)
+
+define float @test_wave_shuffle_float(float %val, i32 %idx) {
+; GFX11-W32-LABEL: test_wave_shuffle_float:
+; GFX11-W32: ; %bb.0: ; %entry
+; GFX11-W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-W32-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-W32-NEXT: ds_bpermute_b32 v0, v1, v0
+; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-W32-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-W32-LABEL: test_wave_shuffle_float:
+; GFX12-W32: ; %bb.0: ; %entry
+; GFX12-W32-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-W32-NEXT: s_wait_expcnt 0x0
+; GFX12-W32-NEXT: s_wait_samplecnt 0x0
+; GFX12-W32-NEXT: s_wait_bvhcnt 0x0
+; GFX12-W32-NEXT: s_wait_kmcnt 0x0
+; GFX12-W32-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-W32-NEXT: ds_bpermute_b32 v0, v1, v0
+; GFX12-W32-NEXT: s_wait_dscnt 0x0
+; GFX12-W32-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-W64-LABEL: test_wave_shuffle_float:
+; GFX11-W64: ; %bb.0: ; %entry
+; GFX11-W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-W64-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX11-W64-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
+; GFX11-W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11-W64-NEXT: v_lshlrev_b32_e32 v3, 2, v1
+; GFX11-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX11-W64-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec
+; GFX11-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX11-W64-NEXT: v_permlane64_b32 v2, v0
+; GFX11-W64-NEXT: ds_bpermute_b32 v2, v3, v2
+; GFX11-W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11-W64-NEXT: ds_bpermute_b32 v0, v3, v0
+; GFX11-W64-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX11-W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-W64-NEXT: v_xor_b32_e32 v1, v3, v1
+; GFX11-W64-NEXT: s_waitcnt lgkmcnt(1)
+; GFX11-W64-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-W64-NEXT: v_and_b32_e32 v1, 32, v1
+; GFX11-W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX11-W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-W64-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX11-W64-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX11-W64-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
+; GFX11-W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11-W64-NEXT: s_waitcnt vmcnt(0)
+; GFX11-W64-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-W64-LABEL: test_wave_shuffle_float:
+; GFX12-W64: ; %bb.0: ; %entry
+; GFX12-W64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-W64-NEXT: s_wait_expcnt 0x0
+; GFX12-W64-NEXT: s_wait_samplecnt 0x0
+; GFX12-W64-NEXT: s_wait_bvhcnt 0x0
+; GFX12-W64-NEXT: s_wait_kmcnt 0x0
+; GFX12-W64-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-W64-NEXT: ds_bpermute_b32 v0, v1, v0
+; GFX12-W64-NEXT: s_wait_dscnt 0x0
+; GFX12-W64-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-W32-GISEL-LABEL: test_wave_shuffle_float:
+; GFX11-W32-GISEL: ; %bb.0: ; %entry
+; GFX11-W32-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-W32-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-W32-GISEL-NEXT: ds_bpermute_b32 v0, v1, v0
+; GFX11-W32-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-W32-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-W32-GISEL-LABEL: test_wave_shuffle_float:
+; GFX12-W32-GISEL: ; %bb.0: ; %entry
+; GFX12-W32-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-W32-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-W32-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-W32-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-W32-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-W32-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-W32-GISEL-NEXT: ds_bpermute_b32 v0, v1, v0
+; GFX12-W32-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX12-W32-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-W64-GISEL-LABEL: test_wave_shuffle_float:
+; GFX11-W64-GISEL: ; %bb.0: ; %entry
+; GFX11-W64-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-W64-GISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX11-W64-GISEL-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
+; GFX11-W64-GISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11-W64-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-W64-GISEL-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX11-W64-GISEL-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec
+; GFX11-W64-GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX11-W64-GISEL-NEXT: v_permlane64_b32 v2, v0
+; GFX11-W64-GISEL-NEXT: ds_bpermute_b32 v2, v1, v2
+; GFX11-W64-GISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11-W64-GISEL-NEXT: ds_bpermute_b32 v0, v1, v0
+; GFX11-W64-GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX11-W64-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-W64-GISEL-NEXT: v_xor_b32_e32 v1, v3, v1
+; GFX11-W64-GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX11-W64-GISEL-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-W64-GISEL-NEXT: v_and_b32_e32 v1, 32, v1
+; GFX11-W64-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-W64-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX11-W64-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-W64-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX11-W64-GISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX11-W64-GISEL-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
+; GFX11-W64-GISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11-W64-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-W64-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-W64-GISEL-LABEL: test_wave_shuffle_float:
+; GFX12-W64-GISEL: ; %bb.0: ; %entry
+; GFX12-W64-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-W64-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-W64-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-W64-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-W64-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-W64-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-W64-GISEL-NEXT: ds_bpermute_b32 v0, v1, v0
+; GFX12-W64-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX12-W64-GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call float @llvm.amdgcn.wave.shuffle(float %val, i32 %idx)
+  ret float %0
+}
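
For illustration only (not part of the patch), a minimal IR sketch of how a caller might use the new intrinsic, assuming the usual overload-suffix mangling (.f32/.i32); the function name and the assumption that the 1D workitem id matches the lane id are illustrative:

define float @wave_shuffle_example(float %val) {
entry:
  ; broadcast: every active lane reads the value held by lane 0
  %bcast = tail call float @llvm.amdgcn.wave.shuffle.f32(float %val, i32 0)
  ; butterfly: exchange values with the neighbouring lane, using the i32 overload
  %lane = tail call i32 @llvm.amdgcn.workitem.id.x()
  %partner = xor i32 %lane, 1
  %vbits = bitcast float %val to i32
  %swapped = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 %vbits, i32 %partner)
  %swapped.f = bitcast i32 %swapped to float
  %sum = fadd float %bcast, %swapped.f
  ret float %sum
}

declare float @llvm.amdgcn.wave.shuffle.f32(float, i32)
declare i32 @llvm.amdgcn.wave.shuffle.i32(i32, i32)
declare i32 @llvm.amdgcn.workitem.id.x()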