From 29bb614a6091ccee86dba4bcc400d557bd8e9de0 Mon Sep 17 00:00:00 2001 From: easyonaadit Date: Wed, 5 Feb 2025 11:39:49 +0530 Subject: [PATCH 1/5] [AMDGPU] Wave Reduce Intrinsics for i32 type Currently, wave reduction intrinsics are supported for `umin` and `umax` operations only. This patch extends support for the following operations: `uadd`, `add`, `usub`, `sub`, `min`, `max`, `and`, `or`, `xor` for `i32` type. --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 10 +- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 11 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 124 +- llvm/lib/Target/AMDGPU/SIInstructions.td | 31 +- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll | 1237 ++++++++++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll | 986 +++++++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll | 986 +++++++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll | 986 +++++++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll | 986 +++++++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll | 1286 ++++++++++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.reduce.uadd.ll | 1240 ++++++++++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll | 24 +- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll | 24 +- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.usub.ll | 1286 ++++++++++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll | 1290 +++++++++++++++++ .../AMDGPU/llvm.amdgcn.wave.reduce.add.mir | 90 ++ .../AMDGPU/llvm.amdgcn.wave.reduce.and.mir | 89 ++ .../AMDGPU/llvm.amdgcn.wave.reduce.max.mir | 89 ++ .../AMDGPU/llvm.amdgcn.wave.reduce.min.mir | 89 ++ .../AMDGPU/llvm.amdgcn.wave.reduce.or.mir | 89 ++ .../AMDGPU/llvm.amdgcn.wave.reduce.sub.mir | 92 ++ .../AMDGPU/llvm.amdgcn.wave.reduce.umax.mir | 4 +- .../AMDGPU/llvm.amdgcn.wave.reduce.umin.mir | 4 +- .../AMDGPU/llvm.amdgcn.wave.reduce.xor.mir | 92 ++ 24 files changed, 11096 insertions(+), 49 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.uadd.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.usub.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.add.mir create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.and.mir create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.max.mir create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.min.mir create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.or.mir create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.sub.mir create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.xor.mir diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index eb7bde6999491..fac4228d3bc1f 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2327,8 +2327,14 @@ class AMDGPUWaveReduce : Intrinsic< ], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg>]>; -def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce; -def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce; +multiclass AMDGPUWaveReduceOps Operations> { + foreach Op = Operations in { def Op : AMDGPUWaveReduce; } +} + +defvar Operations = [ + "umin", "min", "umax", "max", "uadd", "add", "usub", "sub", "and", "or", "xor" +]; +defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceOps; def int_amdgcn_readfirstlane : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 2e5f42c3bdc40..7d8fb718a88ed 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4981,8 +4981,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize); break; } + case Intrinsic::amdgcn_wave_reduce_add: + case Intrinsic::amdgcn_wave_reduce_uadd: + case Intrinsic::amdgcn_wave_reduce_sub: + case Intrinsic::amdgcn_wave_reduce_usub: + case Intrinsic::amdgcn_wave_reduce_min: case Intrinsic::amdgcn_wave_reduce_umin: - case Intrinsic::amdgcn_wave_reduce_umax: { + case Intrinsic::amdgcn_wave_reduce_max: + case Intrinsic::amdgcn_wave_reduce_umax: + case Intrinsic::amdgcn_wave_reduce_and: + case Intrinsic::amdgcn_wave_reduce_or: + case Intrinsic::amdgcn_wave_reduce_xor: { unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b632c50dae0e3..2f239ddecc793 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4940,6 +4940,28 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, return LoopBB; } +static uint32_t getInitialValueForWaveReduction(unsigned Opc) { + switch (Opc) { + case AMDGPU::S_MIN_U32: + return std::numeric_limits::max(); + case AMDGPU::S_MIN_I32: + return std::numeric_limits::max(); + case AMDGPU::S_MAX_U32: + return std::numeric_limits::min(); + case AMDGPU::S_MAX_I32: + return std::numeric_limits::min(); + case AMDGPU::S_ADD_I32: + case AMDGPU::S_SUB_I32: + case AMDGPU::S_OR_B32: + case AMDGPU::S_XOR_B32: + return std::numeric_limits::min(); + case AMDGPU::S_AND_B32: + return std::numeric_limits::max(); + default: + llvm_unreachable("Unexpected opcode in getInitialValueForWaveReduction"); + } +} + static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, @@ -4955,13 +4977,78 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, Register DstReg = MI.getOperand(0).getReg(); MachineBasicBlock *RetBB = nullptr; if (isSGPR) { - // These operations with a uniform value i.e. SGPR are idempotent. - // Reduced value will be same as given sgpr. - // clang-format off - BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg) - .addReg(SrcReg); - // clang-format on - RetBB = &BB; + switch (Opc) { + case AMDGPU::S_MIN_U32: + case AMDGPU::S_MIN_I32: + case AMDGPU::S_MAX_U32: + case AMDGPU::S_MAX_I32: + case AMDGPU::S_AND_B32: + case AMDGPU::S_OR_B32: { + // Idempotent operations. + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg); + RetBB = &BB; + break; + } + case AMDGPU::S_XOR_B32: + case AMDGPU::S_ADD_I32: + case AMDGPU::S_SUB_I32: { + const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass(); + const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg); + Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass); + Register ActiveLanes = MRI.createVirtualRegister(DstRegClass); + + bool IsWave32 = ST.isWave32(); + unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + unsigned CountReg = + IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64; + + auto Exec = + BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg); + + auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes) + .addReg(Exec->getOperand(0).getReg()); + + switch (Opc) { + case AMDGPU::S_XOR_B32: { + // Performing an XOR operation on a uniform value + // depends on the parity of the number of active lanes. + // For even parity, the result will be 0, for odd + // parity the result will be the same as the input value. + Register ParityRegister = MRI.createVirtualRegister(DstRegClass); + + auto ParityReg = + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister) + .addReg(NewAccumulator->getOperand(0).getReg()) + .addImm(1); + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg) + .addReg(SrcReg) + .addReg(ParityReg->getOperand(0).getReg()); + break; + } + case AMDGPU::S_SUB_I32: { + Register NegatedVal = MRI.createVirtualRegister(DstRegClass); + + // Take the negation of the source operand. + auto InvertedValReg = + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal) + .addImm(-1) + .addReg(SrcReg); + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg) + .addReg(InvertedValReg->getOperand(0).getReg()) + .addReg(NewAccumulator->getOperand(0).getReg()); + break; + } + case AMDGPU::S_ADD_I32: { + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg) + .addReg(SrcReg) + .addReg(NewAccumulator->getOperand(0).getReg()); + break; + } + } + RetBB = &BB; + } + } } else { // TODO: Implement DPP Strategy and switch based on immediate strategy // operand. For now, for all the cases (default, Iterative and DPP we use @@ -4997,9 +5084,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; // Create initail values of induction variable from Exec, Accumulator and - // insert branch instr to newly created ComputeBlockk - uint32_t InitalValue = - (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits::max() : 0; + // insert branch instr to newly created ComputeBlock + uint32_t InitalValue = getInitialValueForWaveReduction(Opc); auto TmpSReg = BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg); BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg) @@ -5071,8 +5157,26 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, switch (MI.getOpcode()) { case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32); + case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32); case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32); + case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32); + case AMDGPU::WAVE_REDUCE_UADD_PSEUDO_U32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32); + case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32); + case AMDGPU::WAVE_REDUCE_USUB_PSEUDO_U32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32); + case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32); + case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32); + case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32); + case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32); case AMDGPU::S_UADDO_PSEUDO: case AMDGPU::S_USUBO_PSEUDO: { const DebugLoc &DL = MI.getDebugLoc(); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 6e08aff24ec23..8a3e93a4faa95 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -277,16 +277,31 @@ def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)), def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)), (V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>; -let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { - def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), - (ins VSrc_b32: $src, VSrc_b32:$strategy), - [(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> { +// clang-format off +defvar int_amdgcn_wave_reduce_ = "int_amdgcn_wave_reduce_"; +multiclass + AMDGPUWaveReducePseudoGenerator { + let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { + def !toupper(Op) #"_PSEUDO_" #DataType #Size + : VPseudoInstSI<(outs SGPR_32 : $sdst), + (ins VSrc_b32 : $src, VSrc_b32 : $strategy), + [(set i32 : $sdst, (!cast(int_amdgcn_wave_reduce_ #Op) i32 : $src, i32 : $strategy))]> {} } +} +// clang-format on - def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), - (ins VSrc_b32: $src, VSrc_b32:$strategy), - [(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> { - } +// Input list : [Operation_name, +// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B), +// Size_in_bits] +defvar Operations = [ + ["umin", "U", "32"], ["min", "I", "32"], ["umax", "U", "32"], + ["max", "I", "32"], ["uadd", "U", "32"], ["add", "I", "32"], + ["usub", "U", "32"], ["sub", "I", "32"], ["and", "B", "32"], + ["or", "B", "32"], ["xor", "B", "32"] +]; + +foreach Op = Operations in { + defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator; } let usesCustomInserter = 1, Defs = [VCC] in { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll new file mode 100644 index 0000000000000..e93e8d4108ba0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll @@ -0,0 +1,1237 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s + +define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: uniform_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_clause 0x1 +; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: uniform_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_clause 0x1 +; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: uniform_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_clause 0x1 +; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: uniform_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_clause 0x1 +; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %in, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: const_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: const_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: const_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: const_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: const_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: const_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: const_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: const_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: const_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: const_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: const_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: const_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm + entry: + %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 123, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: poison_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: poison_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: poison_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: poison_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: poison_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: poison_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: poison_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: poison_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: poison_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: poison_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: poison_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: poison_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 poison, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: divergent_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8DAGISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_mov_b32 s4, 0 +; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8GISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9DAGISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_mov_b32 s4, 0 +; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9GISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064DAGISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064GISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032DAGISEL-NEXT: s_add_i32 s2, s2, s5 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032GISEL-NEXT: s_add_i32 s2, s2, s5 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_add_i32 s2, s2, s5 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: s_add_i32 s2, s2, s5 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %id.x, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: divergent_cfg: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8DAGISEL-NEXT: ; %bb.1: ; %else +; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX8DAGISEL-NEXT: ; %bb.3: ; %if +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX8DAGISEL-NEXT: s_add_i32 s6, s6, s8 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8DAGISEL-NEXT: ; %bb.5: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_cfg: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8GISEL-NEXT: ; %bb.1: ; %else +; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s6, s6, s2 +; GFX8GISEL-NEXT: .LBB4_2: ; %Flow +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX8GISEL-NEXT: ; %bb.3: ; %if +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_mov_b32 s6, 0 +; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX8GISEL-NEXT: s_add_i32 s6, s6, s8 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8GISEL-NEXT: .LBB4_5: ; %endif +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_cfg: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9DAGISEL-NEXT: ; %bb.1: ; %else +; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX9DAGISEL-NEXT: ; %bb.3: ; %if +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX9DAGISEL-NEXT: s_add_i32 s6, s6, s8 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9DAGISEL-NEXT: ; %bb.5: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_cfg: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9GISEL-NEXT: ; %bb.1: ; %else +; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s6, s6, s2 +; GFX9GISEL-NEXT: .LBB4_2: ; %Flow +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX9GISEL-NEXT: ; %bb.3: ; %if +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX9GISEL-NEXT: s_add_i32 s6, s6, s8 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9GISEL-NEXT: .LBB4_5: ; %endif +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_cfg: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1064DAGISEL-NEXT: s_add_i32 s6, s6, s8 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064DAGISEL-NEXT: ; %bb.5: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_cfg: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064GISEL-NEXT: ; %bb.1: ; %else +; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s2 +; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1064GISEL-NEXT: ; %bb.3: ; %if +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1064GISEL-NEXT: s_add_i32 s6, s6, s8 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064GISEL-NEXT: .LBB4_5: ; %endif +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_cfg: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s2 +; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1032DAGISEL-NEXT: s_add_i32 s1, s1, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032DAGISEL-NEXT: ; %bb.5: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_cfg: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032GISEL-NEXT: ; %bb.1: ; %else +; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s2 +; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1032GISEL-NEXT: ; %bb.3: ; %if +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1032GISEL-NEXT: s_add_i32 s0, s0, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032GISEL-NEXT: .LBB4_5: ; %endif +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_cfg: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1164DAGISEL-NEXT: s_add_i32 s6, s6, s8 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164DAGISEL-NEXT: ; %bb.5: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_cfg: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164GISEL-NEXT: ; %bb.1: ; %else +; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s6, s6, s2 +; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1164GISEL-NEXT: ; %bb.3: ; %if +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_mov_b32 s6, 0 +; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1164GISEL-NEXT: s_add_i32 s6, s6, s8 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164GISEL-NEXT: .LBB4_5: ; %endif +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_cfg: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s2 +; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1132DAGISEL-NEXT: s_add_i32 s1, s1, s6 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132DAGISEL-NEXT: ; %bb.5: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_cfg: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132GISEL-NEXT: ; %bb.1: ; %else +; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s2 +; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1132GISEL-NEXT: ; %bb.3: ; %if +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1132GISEL-NEXT: s_add_i32 s0, s0, s6 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132GISEL-NEXT: .LBB4_5: ; %endif +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1132GISEL-NEXT: s_endpgm + entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %tid, i32 1) + br label %endif + +else: + %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %in, i32 1) + br label %endif + +endif: + %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else] + store i32 %combine, ptr addrspace(1) %out + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX10DAGISEL: {{.*}} +; GFX10GISEL: {{.*}} +; GFX11DAGISEL: {{.*}} +; GFX11GISEL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll new file mode 100644 index 0000000000000..ac11fa6a5a7a5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll @@ -0,0 +1,986 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s + +define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: uniform_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_clause 0x1 +; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: uniform_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_clause 0x1 +; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %in, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: const_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: const_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: const_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: const_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: const_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: const_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: const_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: const_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: const_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: const_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm + entry: + %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 123, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: poison_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: poison_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: poison_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: poison_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: poison_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: poison_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX11DAGISEL-LABEL: poison_value: +; GFX11DAGISEL: ; %bb.0: ; %entry +; GFX11DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11DAGISEL-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11DAGISEL-NEXT: s_endpgm +; +; GFX11GISEL-LABEL: poison_value: +; GFX11GISEL: ; %bb.0: ; %entry +; GFX11GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11GISEL-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 poison, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: divergent_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s4, -1 +; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8DAGISEL-NEXT: s_and_b32 s4, s4, s6 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_mov_b32 s4, -1 +; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8GISEL-NEXT: s_and_b32 s4, s4, s6 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s4, -1 +; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9DAGISEL-NEXT: s_and_b32 s4, s4, s6 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_mov_b32 s4, -1 +; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9GISEL-NEXT: s_and_b32 s4, s4, s6 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s4, -1 +; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064DAGISEL-NEXT: s_and_b32 s4, s4, s6 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_mov_b32 s4, -1 +; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064GISEL-NEXT: s_and_b32 s4, s4, s6 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, -1 +; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032DAGISEL-NEXT: s_and_b32 s2, s2, s5 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s2, -1 +; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032GISEL-NEXT: s_and_b32 s2, s2, s5 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s4, -1 +; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_and_b32 s4, s4, s6 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_mov_b32 s4, -1 +; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: s_and_b32 s4, s4, s6 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, -1 +; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_and_b32 s2, s2, s5 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s2, -1 +; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: s_and_b32 s2, s2, s5 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %id.x, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: divergent_cfg: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8DAGISEL-NEXT: ; %bb.1: ; %else +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8DAGISEL-NEXT: ; %bb.2: ; %Flow +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX8DAGISEL-NEXT: ; %bb.3: ; %if +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s6, -1 +; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX8DAGISEL-NEXT: s_and_b32 s6, s6, s8 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8DAGISEL-NEXT: ; %bb.5: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_cfg: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8GISEL-NEXT: ; %bb.1: ; %else +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mov_b32 s6, s2 +; GFX8GISEL-NEXT: .LBB4_2: ; %Flow +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX8GISEL-NEXT: ; %bb.3: ; %if +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_mov_b32 s6, -1 +; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX8GISEL-NEXT: s_and_b32 s6, s6, s8 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8GISEL-NEXT: .LBB4_5: ; %endif +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_cfg: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9DAGISEL-NEXT: ; %bb.1: ; %else +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9DAGISEL-NEXT: ; %bb.2: ; %Flow +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX9DAGISEL-NEXT: ; %bb.3: ; %if +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s6, -1 +; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX9DAGISEL-NEXT: s_and_b32 s6, s6, s8 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9DAGISEL-NEXT: ; %bb.5: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_cfg: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9GISEL-NEXT: ; %bb.1: ; %else +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mov_b32 s6, s2 +; GFX9GISEL-NEXT: .LBB4_2: ; %Flow +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX9GISEL-NEXT: ; %bb.3: ; %if +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_mov_b32 s6, -1 +; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX9GISEL-NEXT: s_and_b32 s6, s6, s8 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9GISEL-NEXT: .LBB4_5: ; %endif +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_cfg: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064DAGISEL-NEXT: ; %bb.2: ; %Flow +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s6, -1 +; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1064DAGISEL-NEXT: s_and_b32 s6, s6, s8 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064DAGISEL-NEXT: ; %bb.5: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_cfg: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064GISEL-NEXT: ; %bb.1: ; %else +; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mov_b32 s6, s2 +; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1064GISEL-NEXT: ; %bb.3: ; %if +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_mov_b32 s6, -1 +; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1064GISEL-NEXT: s_and_b32 s6, s6, s8 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064GISEL-NEXT: .LBB4_5: ; %endif +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_cfg: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032DAGISEL-NEXT: ; %bb.2: ; %Flow +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s1, -1 +; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1032DAGISEL-NEXT: s_and_b32 s1, s1, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032DAGISEL-NEXT: ; %bb.5: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_cfg: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032GISEL-NEXT: ; %bb.1: ; %else +; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1032GISEL-NEXT: ; %bb.3: ; %if +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s0, -1 +; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1032GISEL-NEXT: s_and_b32 s0, s0, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032GISEL-NEXT: .LBB4_5: ; %endif +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_cfg: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164DAGISEL-NEXT: ; %bb.2: ; %Flow +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s6, -1 +; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1164DAGISEL-NEXT: s_and_b32 s6, s6, s8 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164DAGISEL-NEXT: ; %bb.5: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_cfg: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164GISEL-NEXT: ; %bb.1: ; %else +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mov_b32 s6, s2 +; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1164GISEL-NEXT: ; %bb.3: ; %if +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_mov_b32 s6, -1 +; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1164GISEL-NEXT: s_and_b32 s6, s6, s8 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164GISEL-NEXT: .LBB4_5: ; %endif +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_cfg: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132DAGISEL-NEXT: ; %bb.2: ; %Flow +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, -1 +; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1132DAGISEL-NEXT: s_and_b32 s1, s1, s6 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132DAGISEL-NEXT: ; %bb.5: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_cfg: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132GISEL-NEXT: ; %bb.1: ; %else +; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1132GISEL-NEXT: ; %bb.3: ; %if +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s0, -1 +; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1132GISEL-NEXT: s_and_b32 s0, s0, s6 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132GISEL-NEXT: .LBB4_5: ; %endif +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1132GISEL-NEXT: s_endpgm + entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %tid, i32 1) + br label %endif + +else: + %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %in, i32 1) + br label %endif + +endif: + %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else] + store i32 %combine, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll new file mode 100644 index 0000000000000..2df2886d068c8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll @@ -0,0 +1,986 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s + +define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: uniform_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_clause 0x1 +; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: uniform_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_clause 0x1 +; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %in, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: const_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: const_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: const_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: const_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: const_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: const_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: const_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: const_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: const_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: const_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm + entry: + %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 123, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: poison_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: poison_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: poison_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: poison_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: poison_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: poison_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX11DAGISEL-LABEL: poison_value: +; GFX11DAGISEL: ; %bb.0: ; %entry +; GFX11DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11DAGISEL-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11DAGISEL-NEXT: s_endpgm +; +; GFX11GISEL-LABEL: poison_value: +; GFX11GISEL: ; %bb.0: ; %entry +; GFX11GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11GISEL-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 poison, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: divergent_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_brev_b32 s4, 1 +; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8DAGISEL-NEXT: s_max_i32 s4, s4, s6 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_brev_b32 s4, 1 +; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8GISEL-NEXT: s_max_i32 s4, s4, s6 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_brev_b32 s4, 1 +; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9DAGISEL-NEXT: s_max_i32 s4, s4, s6 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_brev_b32 s4, 1 +; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9GISEL-NEXT: s_max_i32 s4, s4, s6 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: s_brev_b32 s4, 1 +; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064DAGISEL-NEXT: s_max_i32 s4, s4, s6 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_brev_b32 s4, 1 +; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064GISEL-NEXT: s_max_i32 s4, s4, s6 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: s_brev_b32 s2, 1 +; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032DAGISEL-NEXT: s_max_i32 s2, s2, s5 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL-NEXT: s_brev_b32 s2, 1 +; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032GISEL-NEXT: s_max_i32 s2, s2, s5 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_brev_b32 s4, 1 +; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_max_i32 s4, s4, s6 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_brev_b32 s4, 1 +; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: s_max_i32 s4, s4, s6 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_brev_b32 s2, 1 +; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_max_i32 s2, s2, s5 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: s_brev_b32 s2, 1 +; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: s_max_i32 s2, s2, s5 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %id.x, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: divergent_cfg: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8DAGISEL-NEXT: ; %bb.1: ; %else +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8DAGISEL-NEXT: ; %bb.2: ; %Flow +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX8DAGISEL-NEXT: ; %bb.3: ; %if +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_brev_b32 s6, 1 +; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX8DAGISEL-NEXT: s_max_i32 s6, s6, s8 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8DAGISEL-NEXT: ; %bb.5: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_cfg: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8GISEL-NEXT: ; %bb.1: ; %else +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mov_b32 s6, s2 +; GFX8GISEL-NEXT: .LBB4_2: ; %Flow +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX8GISEL-NEXT: ; %bb.3: ; %if +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_brev_b32 s6, 1 +; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX8GISEL-NEXT: s_max_i32 s6, s6, s8 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8GISEL-NEXT: .LBB4_5: ; %endif +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_cfg: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9DAGISEL-NEXT: ; %bb.1: ; %else +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9DAGISEL-NEXT: ; %bb.2: ; %Flow +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX9DAGISEL-NEXT: ; %bb.3: ; %if +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_brev_b32 s6, 1 +; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX9DAGISEL-NEXT: s_max_i32 s6, s6, s8 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9DAGISEL-NEXT: ; %bb.5: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_cfg: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9GISEL-NEXT: ; %bb.1: ; %else +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mov_b32 s6, s2 +; GFX9GISEL-NEXT: .LBB4_2: ; %Flow +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX9GISEL-NEXT: ; %bb.3: ; %if +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_brev_b32 s6, 1 +; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX9GISEL-NEXT: s_max_i32 s6, s6, s8 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9GISEL-NEXT: .LBB4_5: ; %endif +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_cfg: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064DAGISEL-NEXT: ; %bb.2: ; %Flow +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: s_brev_b32 s6, 1 +; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1064DAGISEL-NEXT: s_max_i32 s6, s6, s8 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064DAGISEL-NEXT: ; %bb.5: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_cfg: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064GISEL-NEXT: ; %bb.1: ; %else +; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mov_b32 s6, s2 +; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1064GISEL-NEXT: ; %bb.3: ; %if +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_brev_b32 s6, 1 +; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1064GISEL-NEXT: s_max_i32 s6, s6, s8 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064GISEL-NEXT: .LBB4_5: ; %endif +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_cfg: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032DAGISEL-NEXT: ; %bb.2: ; %Flow +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: s_brev_b32 s1, 1 +; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1032DAGISEL-NEXT: s_max_i32 s1, s1, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032DAGISEL-NEXT: ; %bb.5: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_cfg: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032GISEL-NEXT: ; %bb.1: ; %else +; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1032GISEL-NEXT: ; %bb.3: ; %if +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: s_brev_b32 s0, 1 +; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1032GISEL-NEXT: s_max_i32 s0, s0, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032GISEL-NEXT: .LBB4_5: ; %endif +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_cfg: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164DAGISEL-NEXT: ; %bb.2: ; %Flow +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_brev_b32 s6, 1 +; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1164DAGISEL-NEXT: s_max_i32 s6, s6, s8 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164DAGISEL-NEXT: ; %bb.5: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_cfg: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164GISEL-NEXT: ; %bb.1: ; %else +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mov_b32 s6, s2 +; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1164GISEL-NEXT: ; %bb.3: ; %if +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_brev_b32 s6, 1 +; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1164GISEL-NEXT: s_max_i32 s6, s6, s8 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164GISEL-NEXT: .LBB4_5: ; %endif +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_cfg: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132DAGISEL-NEXT: ; %bb.2: ; %Flow +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_brev_b32 s1, 1 +; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1132DAGISEL-NEXT: s_max_i32 s1, s1, s6 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132DAGISEL-NEXT: ; %bb.5: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_cfg: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132GISEL-NEXT: ; %bb.1: ; %else +; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1132GISEL-NEXT: ; %bb.3: ; %if +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_brev_b32 s0, 1 +; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1132GISEL-NEXT: s_max_i32 s0, s0, s6 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132GISEL-NEXT: .LBB4_5: ; %endif +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1132GISEL-NEXT: s_endpgm + entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %tid, i32 1) + br label %endif + +else: + %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %in, i32 1) + br label %endif + +endif: + %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else] + store i32 %combine, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll new file mode 100644 index 0000000000000..45335c7234956 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll @@ -0,0 +1,986 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s + +define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: uniform_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_clause 0x1 +; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: uniform_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_clause 0x1 +; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %in, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: const_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: const_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: const_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: const_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: const_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: const_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: const_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: const_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: const_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: const_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm + entry: + %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 123, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: poison_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: poison_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: poison_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: poison_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: poison_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: poison_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX11DAGISEL-LABEL: poison_value: +; GFX11DAGISEL: ; %bb.0: ; %entry +; GFX11DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11DAGISEL-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11DAGISEL-NEXT: s_endpgm +; +; GFX11GISEL-LABEL: poison_value: +; GFX11GISEL: ; %bb.0: ; %entry +; GFX11GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11GISEL-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 poison, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: divergent_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_brev_b32 s4, -2 +; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8DAGISEL-NEXT: s_min_i32 s4, s4, s6 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_brev_b32 s4, -2 +; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8GISEL-NEXT: s_min_i32 s4, s4, s6 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_brev_b32 s4, -2 +; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9DAGISEL-NEXT: s_min_i32 s4, s4, s6 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_brev_b32 s4, -2 +; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9GISEL-NEXT: s_min_i32 s4, s4, s6 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: s_brev_b32 s4, -2 +; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064DAGISEL-NEXT: s_min_i32 s4, s4, s6 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_brev_b32 s4, -2 +; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064GISEL-NEXT: s_min_i32 s4, s4, s6 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: s_brev_b32 s2, -2 +; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032DAGISEL-NEXT: s_min_i32 s2, s2, s5 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL-NEXT: s_brev_b32 s2, -2 +; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032GISEL-NEXT: s_min_i32 s2, s2, s5 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_brev_b32 s4, -2 +; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_min_i32 s4, s4, s6 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_brev_b32 s4, -2 +; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: s_min_i32 s4, s4, s6 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_brev_b32 s2, -2 +; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_min_i32 s2, s2, s5 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: s_brev_b32 s2, -2 +; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: s_min_i32 s2, s2, s5 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %id.x, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: divergent_cfg: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8DAGISEL-NEXT: ; %bb.1: ; %else +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8DAGISEL-NEXT: ; %bb.2: ; %Flow +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX8DAGISEL-NEXT: ; %bb.3: ; %if +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_brev_b32 s6, -2 +; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX8DAGISEL-NEXT: s_min_i32 s6, s6, s8 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8DAGISEL-NEXT: ; %bb.5: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_cfg: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8GISEL-NEXT: ; %bb.1: ; %else +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mov_b32 s6, s2 +; GFX8GISEL-NEXT: .LBB4_2: ; %Flow +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX8GISEL-NEXT: ; %bb.3: ; %if +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_brev_b32 s6, -2 +; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX8GISEL-NEXT: s_min_i32 s6, s6, s8 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8GISEL-NEXT: .LBB4_5: ; %endif +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_cfg: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9DAGISEL-NEXT: ; %bb.1: ; %else +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9DAGISEL-NEXT: ; %bb.2: ; %Flow +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX9DAGISEL-NEXT: ; %bb.3: ; %if +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_brev_b32 s6, -2 +; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX9DAGISEL-NEXT: s_min_i32 s6, s6, s8 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9DAGISEL-NEXT: ; %bb.5: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_cfg: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9GISEL-NEXT: ; %bb.1: ; %else +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mov_b32 s6, s2 +; GFX9GISEL-NEXT: .LBB4_2: ; %Flow +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX9GISEL-NEXT: ; %bb.3: ; %if +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_brev_b32 s6, -2 +; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX9GISEL-NEXT: s_min_i32 s6, s6, s8 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9GISEL-NEXT: .LBB4_5: ; %endif +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_cfg: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064DAGISEL-NEXT: ; %bb.2: ; %Flow +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: s_brev_b32 s6, -2 +; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1064DAGISEL-NEXT: s_min_i32 s6, s6, s8 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064DAGISEL-NEXT: ; %bb.5: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_cfg: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064GISEL-NEXT: ; %bb.1: ; %else +; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mov_b32 s6, s2 +; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1064GISEL-NEXT: ; %bb.3: ; %if +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_brev_b32 s6, -2 +; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1064GISEL-NEXT: s_min_i32 s6, s6, s8 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064GISEL-NEXT: .LBB4_5: ; %endif +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_cfg: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032DAGISEL-NEXT: ; %bb.2: ; %Flow +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: s_brev_b32 s1, -2 +; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1032DAGISEL-NEXT: s_min_i32 s1, s1, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032DAGISEL-NEXT: ; %bb.5: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_cfg: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032GISEL-NEXT: ; %bb.1: ; %else +; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1032GISEL-NEXT: ; %bb.3: ; %if +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: s_brev_b32 s0, -2 +; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1032GISEL-NEXT: s_min_i32 s0, s0, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032GISEL-NEXT: .LBB4_5: ; %endif +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_cfg: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164DAGISEL-NEXT: ; %bb.2: ; %Flow +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_brev_b32 s6, -2 +; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1164DAGISEL-NEXT: s_min_i32 s6, s6, s8 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164DAGISEL-NEXT: ; %bb.5: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_cfg: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164GISEL-NEXT: ; %bb.1: ; %else +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mov_b32 s6, s2 +; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1164GISEL-NEXT: ; %bb.3: ; %if +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_brev_b32 s6, -2 +; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1164GISEL-NEXT: s_min_i32 s6, s6, s8 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164GISEL-NEXT: .LBB4_5: ; %endif +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_cfg: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132DAGISEL-NEXT: ; %bb.2: ; %Flow +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_brev_b32 s1, -2 +; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1132DAGISEL-NEXT: s_min_i32 s1, s1, s6 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132DAGISEL-NEXT: ; %bb.5: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_cfg: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132GISEL-NEXT: ; %bb.1: ; %else +; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1132GISEL-NEXT: ; %bb.3: ; %if +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_brev_b32 s0, -2 +; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1132GISEL-NEXT: s_min_i32 s0, s0, s6 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132GISEL-NEXT: .LBB4_5: ; %endif +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1132GISEL-NEXT: s_endpgm + entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %tid, i32 1) + br label %endif + +else: + %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %in, i32 1) + br label %endif + +endif: + %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else] + store i32 %combine, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll new file mode 100644 index 0000000000000..590c5cc383d43 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll @@ -0,0 +1,986 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s + +define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: uniform_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_clause 0x1 +; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: uniform_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_clause 0x1 +; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %in, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: const_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: const_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: const_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: const_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: const_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: const_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: const_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: const_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: const_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: const_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm + entry: + %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 123, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: poison_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: poison_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: poison_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: poison_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: poison_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: poison_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: global_store_dword v0, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX11DAGISEL-LABEL: poison_value: +; GFX11DAGISEL: ; %bb.0: ; %entry +; GFX11DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11DAGISEL-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11DAGISEL-NEXT: s_endpgm +; +; GFX11GISEL-LABEL: poison_value: +; GFX11GISEL: ; %bb.0: ; %entry +; GFX11GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11GISEL-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 poison, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: divergent_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8DAGISEL-NEXT: s_or_b32 s4, s4, s6 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_mov_b32 s4, 0 +; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8GISEL-NEXT: s_or_b32 s4, s4, s6 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9DAGISEL-NEXT: s_or_b32 s4, s4, s6 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_mov_b32 s4, 0 +; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9GISEL-NEXT: s_or_b32 s4, s4, s6 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064DAGISEL-NEXT: s_or_b32 s4, s4, s6 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064GISEL-NEXT: s_or_b32 s4, s4, s6 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032DAGISEL-NEXT: s_or_b32 s2, s2, s5 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032GISEL-NEXT: s_or_b32 s2, s2, s5 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_or_b32 s4, s4, s6 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: s_or_b32 s4, s4, s6 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_or_b32 s2, s2, s5 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: s_or_b32 s2, s2, s5 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %id.x, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: divergent_cfg: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8DAGISEL-NEXT: ; %bb.1: ; %else +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8DAGISEL-NEXT: ; %bb.2: ; %Flow +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX8DAGISEL-NEXT: ; %bb.3: ; %if +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX8DAGISEL-NEXT: s_or_b32 s6, s6, s8 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8DAGISEL-NEXT: ; %bb.5: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_cfg: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8GISEL-NEXT: ; %bb.1: ; %else +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mov_b32 s6, s2 +; GFX8GISEL-NEXT: .LBB4_2: ; %Flow +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX8GISEL-NEXT: ; %bb.3: ; %if +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_mov_b32 s6, 0 +; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX8GISEL-NEXT: s_or_b32 s6, s6, s8 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8GISEL-NEXT: .LBB4_5: ; %endif +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_cfg: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9DAGISEL-NEXT: ; %bb.1: ; %else +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9DAGISEL-NEXT: ; %bb.2: ; %Flow +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX9DAGISEL-NEXT: ; %bb.3: ; %if +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX9DAGISEL-NEXT: s_or_b32 s6, s6, s8 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9DAGISEL-NEXT: ; %bb.5: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_cfg: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9GISEL-NEXT: ; %bb.1: ; %else +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mov_b32 s6, s2 +; GFX9GISEL-NEXT: .LBB4_2: ; %Flow +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX9GISEL-NEXT: ; %bb.3: ; %if +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX9GISEL-NEXT: s_or_b32 s6, s6, s8 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9GISEL-NEXT: .LBB4_5: ; %endif +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_cfg: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064DAGISEL-NEXT: ; %bb.2: ; %Flow +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1064DAGISEL-NEXT: s_or_b32 s6, s6, s8 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064DAGISEL-NEXT: ; %bb.5: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_cfg: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064GISEL-NEXT: ; %bb.1: ; %else +; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mov_b32 s6, s2 +; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1064GISEL-NEXT: ; %bb.3: ; %if +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1064GISEL-NEXT: s_or_b32 s6, s6, s8 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064GISEL-NEXT: .LBB4_5: ; %endif +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_cfg: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032DAGISEL-NEXT: ; %bb.2: ; %Flow +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1032DAGISEL-NEXT: s_or_b32 s1, s1, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032DAGISEL-NEXT: ; %bb.5: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_cfg: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032GISEL-NEXT: ; %bb.1: ; %else +; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1032GISEL-NEXT: ; %bb.3: ; %if +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1032GISEL-NEXT: s_or_b32 s0, s0, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032GISEL-NEXT: .LBB4_5: ; %endif +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_cfg: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164DAGISEL-NEXT: ; %bb.2: ; %Flow +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1164DAGISEL-NEXT: s_or_b32 s6, s6, s8 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164DAGISEL-NEXT: ; %bb.5: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_cfg: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164GISEL-NEXT: ; %bb.1: ; %else +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mov_b32 s6, s2 +; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1164GISEL-NEXT: ; %bb.3: ; %if +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_mov_b32 s6, 0 +; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1164GISEL-NEXT: s_or_b32 s6, s6, s8 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164GISEL-NEXT: .LBB4_5: ; %endif +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_cfg: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132DAGISEL-NEXT: ; %bb.2: ; %Flow +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1132DAGISEL-NEXT: s_or_b32 s1, s1, s6 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132DAGISEL-NEXT: ; %bb.5: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_cfg: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132GISEL-NEXT: ; %bb.1: ; %else +; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1132GISEL-NEXT: ; %bb.3: ; %if +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1132GISEL-NEXT: s_or_b32 s0, s0, s6 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132GISEL-NEXT: .LBB4_5: ; %endif +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1132GISEL-NEXT: s_endpgm + entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %tid, i32 1) + br label %endif + +else: + %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %in, i32 1) + br label %endif + +endif: + %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else] + store i32 %combine, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll new file mode 100644 index 0000000000000..1377dc51c4b34 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll @@ -0,0 +1,1286 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s + +define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: uniform_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_clause 0x1 +; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: uniform_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_clause 0x1 +; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: uniform_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_clause 0x1 +; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, -1 +; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: uniform_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_clause 0x1 +; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, -1 +; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, -1 +; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %in, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: const_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_mul_i32 s4, -1, 0x7b +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: const_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: const_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: const_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: const_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: const_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: const_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: const_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: const_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: const_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: const_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: const_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm + entry: + %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 123, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: poison_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s4, s0, -1 +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: poison_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: poison_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: poison_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: poison_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: poison_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: poison_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: poison_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX1032GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: poison_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: poison_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: poison_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: poison_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 poison, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: divergent_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8DAGISEL-NEXT: s_sub_i32 s4, s4, s6 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_mov_b32 s4, 0 +; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8GISEL-NEXT: s_sub_i32 s4, s4, s6 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9DAGISEL-NEXT: s_sub_i32 s4, s4, s6 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_mov_b32 s4, 0 +; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9GISEL-NEXT: s_sub_i32 s4, s4, s6 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064DAGISEL-NEXT: s_sub_i32 s4, s4, s6 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064GISEL-NEXT: s_sub_i32 s4, s4, s6 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032DAGISEL-NEXT: s_sub_i32 s2, s2, s5 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032GISEL-NEXT: s_sub_i32 s2, s2, s5 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_sub_i32 s4, s4, s6 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: s_sub_i32 s4, s4, s6 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_sub_i32 s2, s2, s5 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: s_sub_i32 s2, s2, s5 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %id.x, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: divergent_cfg: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8DAGISEL-NEXT: ; %bb.1: ; %else +; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX8DAGISEL-NEXT: ; %bb.3: ; %if +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX8DAGISEL-NEXT: s_sub_i32 s6, s6, s8 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8DAGISEL-NEXT: ; %bb.5: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_cfg: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8GISEL-NEXT: ; %bb.1: ; %else +; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX8GISEL-NEXT: s_mul_i32 s6, s3, s2 +; GFX8GISEL-NEXT: .LBB4_2: ; %Flow +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX8GISEL-NEXT: ; %bb.3: ; %if +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_mov_b32 s6, 0 +; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX8GISEL-NEXT: s_sub_i32 s6, s6, s8 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8GISEL-NEXT: .LBB4_5: ; %endif +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_cfg: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9DAGISEL-NEXT: ; %bb.1: ; %else +; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX9DAGISEL-NEXT: ; %bb.3: ; %if +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX9DAGISEL-NEXT: s_sub_i32 s6, s6, s8 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9DAGISEL-NEXT: ; %bb.5: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_cfg: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9GISEL-NEXT: ; %bb.1: ; %else +; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX9GISEL-NEXT: s_mul_i32 s6, s3, s2 +; GFX9GISEL-NEXT: .LBB4_2: ; %Flow +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX9GISEL-NEXT: ; %bb.3: ; %if +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX9GISEL-NEXT: s_sub_i32 s6, s6, s8 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9GISEL-NEXT: .LBB4_5: ; %endif +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_cfg: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1064DAGISEL-NEXT: s_sub_i32 s6, s6, s8 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064DAGISEL-NEXT: ; %bb.5: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_cfg: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064GISEL-NEXT: ; %bb.1: ; %else +; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX1064GISEL-NEXT: s_mul_i32 s6, s3, s2 +; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1064GISEL-NEXT: ; %bb.3: ; %if +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1064GISEL-NEXT: s_sub_i32 s6, s6, s8 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064GISEL-NEXT: .LBB4_5: ; %endif +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_cfg: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, -1 +; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s2 +; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1032DAGISEL-NEXT: s_sub_i32 s1, s1, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032DAGISEL-NEXT: ; %bb.5: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_cfg: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032GISEL-NEXT: ; %bb.1: ; %else +; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, -1 +; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s2 +; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1032GISEL-NEXT: ; %bb.3: ; %if +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1032GISEL-NEXT: s_sub_i32 s0, s0, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032GISEL-NEXT: .LBB4_5: ; %endif +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_cfg: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1164DAGISEL-NEXT: s_sub_i32 s6, s6, s8 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164DAGISEL-NEXT: ; %bb.5: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_cfg: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164GISEL-NEXT: ; %bb.1: ; %else +; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX1164GISEL-NEXT: s_mul_i32 s6, s3, s2 +; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1164GISEL-NEXT: ; %bb.3: ; %if +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_mov_b32 s6, 0 +; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1164GISEL-NEXT: s_sub_i32 s6, s6, s8 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164GISEL-NEXT: .LBB4_5: ; %endif +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_cfg: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, -1 +; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s2 +; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1132DAGISEL-NEXT: s_sub_i32 s1, s1, s6 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132DAGISEL-NEXT: ; %bb.5: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_cfg: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132GISEL-NEXT: ; %bb.1: ; %else +; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, -1 +; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s2 +; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1132GISEL-NEXT: ; %bb.3: ; %if +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1132GISEL-NEXT: s_sub_i32 s0, s0, s6 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132GISEL-NEXT: .LBB4_5: ; %endif +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1132GISEL-NEXT: s_endpgm + entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %tid, i32 1) + br label %endif + +else: + %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %in, i32 1) + br label %endif + +endif: + %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else] + store i32 %combine, ptr addrspace(1) %out + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX10DAGISEL: {{.*}} +; GFX10GISEL: {{.*}} +; GFX11DAGISEL: {{.*}} +; GFX11GISEL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.uadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.uadd.ll new file mode 100644 index 0000000000000..d7b0ebb709a77 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.uadd.ll @@ -0,0 +1,1240 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s + +declare i32 @llvm.amdgcn.wave.reduce.uadd.i32(i32, i32 immarg) +declare i32 @llvm.amdgcn.workitem.id.x() + +define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: uniform_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_clause 0x1 +; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: uniform_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_clause 0x1 +; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: uniform_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_clause 0x1 +; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: uniform_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_clause 0x1 +; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.uadd.i32(i32 %in, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: const_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: const_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: const_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: const_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: const_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: const_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: const_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: const_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: const_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: const_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: const_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: const_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm + entry: + %result = call i32 @llvm.amdgcn.wave.reduce.uadd.i32(i32 123, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: poison_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: poison_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: poison_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: poison_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: poison_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: poison_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: poison_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: poison_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: poison_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: poison_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: poison_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: poison_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.uadd.i32(i32 poison, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: divergent_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8DAGISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_mov_b32 s4, 0 +; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8GISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9DAGISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_mov_b32 s4, 0 +; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9GISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064DAGISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064GISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032DAGISEL-NEXT: s_add_i32 s2, s2, s5 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032GISEL-NEXT: s_add_i32 s2, s2, s5 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_add_i32 s2, s2, s5 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: s_add_i32 s2, s2, s5 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.uadd.i32(i32 %id.x, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: divergent_cfg: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8DAGISEL-NEXT: ; %bb.1: ; %else +; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX8DAGISEL-NEXT: ; %bb.3: ; %if +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX8DAGISEL-NEXT: s_add_i32 s6, s6, s8 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8DAGISEL-NEXT: ; %bb.5: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_cfg: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8GISEL-NEXT: ; %bb.1: ; %else +; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s6, s6, s2 +; GFX8GISEL-NEXT: .LBB4_2: ; %Flow +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX8GISEL-NEXT: ; %bb.3: ; %if +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_mov_b32 s6, 0 +; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX8GISEL-NEXT: s_add_i32 s6, s6, s8 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8GISEL-NEXT: .LBB4_5: ; %endif +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_cfg: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9DAGISEL-NEXT: ; %bb.1: ; %else +; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX9DAGISEL-NEXT: ; %bb.3: ; %if +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX9DAGISEL-NEXT: s_add_i32 s6, s6, s8 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9DAGISEL-NEXT: ; %bb.5: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_cfg: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9GISEL-NEXT: ; %bb.1: ; %else +; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s6, s6, s2 +; GFX9GISEL-NEXT: .LBB4_2: ; %Flow +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX9GISEL-NEXT: ; %bb.3: ; %if +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX9GISEL-NEXT: s_add_i32 s6, s6, s8 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9GISEL-NEXT: .LBB4_5: ; %endif +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_cfg: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1064DAGISEL-NEXT: s_add_i32 s6, s6, s8 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064DAGISEL-NEXT: ; %bb.5: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_cfg: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064GISEL-NEXT: ; %bb.1: ; %else +; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s2 +; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1064GISEL-NEXT: ; %bb.3: ; %if +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1064GISEL-NEXT: s_add_i32 s6, s6, s8 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064GISEL-NEXT: .LBB4_5: ; %endif +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_cfg: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s2 +; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1032DAGISEL-NEXT: s_add_i32 s1, s1, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032DAGISEL-NEXT: ; %bb.5: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_cfg: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032GISEL-NEXT: ; %bb.1: ; %else +; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s2 +; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1032GISEL-NEXT: ; %bb.3: ; %if +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1032GISEL-NEXT: s_add_i32 s0, s0, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032GISEL-NEXT: .LBB4_5: ; %endif +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_cfg: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1164DAGISEL-NEXT: s_add_i32 s6, s6, s8 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164DAGISEL-NEXT: ; %bb.5: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_cfg: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164GISEL-NEXT: ; %bb.1: ; %else +; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s6, s6, s2 +; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1164GISEL-NEXT: ; %bb.3: ; %if +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_mov_b32 s6, 0 +; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1164GISEL-NEXT: s_add_i32 s6, s6, s8 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164GISEL-NEXT: .LBB4_5: ; %endif +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_cfg: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s2 +; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1132DAGISEL-NEXT: s_add_i32 s1, s1, s6 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132DAGISEL-NEXT: ; %bb.5: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_cfg: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132GISEL-NEXT: ; %bb.1: ; %else +; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s2 +; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1132GISEL-NEXT: ; %bb.3: ; %if +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1132GISEL-NEXT: s_add_i32 s0, s0, s6 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132GISEL-NEXT: .LBB4_5: ; %endif +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1132GISEL-NEXT: s_endpgm + entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.uadd.i32(i32 %tid, i32 1) + br label %endif + +else: + %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.uadd.i32(i32 %in, i32 1) + br label %endif + +endif: + %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else] + store i32 %combine, ptr addrspace(1) %out + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX10DAGISEL: {{.*}} +; GFX10GISEL: {{.*}} +; GFX11DAGISEL: {{.*}} +; GFX11GISEL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll index f72f1e52d135f..8bf90c9ce7378 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s declare i32 @llvm.amdgcn.wave.reduce.umax.i32(i32, i32 immarg) declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll index 4551c60770bdf..996ae7256d584 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s declare i32 @llvm.amdgcn.wave.reduce.umin.i32(i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.usub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.usub.ll new file mode 100644 index 0000000000000..daff91c48c814 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.usub.ll @@ -0,0 +1,1286 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s + +define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: uniform_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_clause 0x1 +; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: uniform_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_clause 0x1 +; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: uniform_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_clause 0x1 +; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, -1 +; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: uniform_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_clause 0x1 +; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, -1 +; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, -1 +; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.usub.i32(i32 %in, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: const_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_mul_i32 s4, -1, 0x7b +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: const_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: const_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: const_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: const_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: const_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: const_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: const_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: const_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: const_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: const_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: const_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_mul_i32 s3, -1, 0x7b +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm + entry: + %result = call i32 @llvm.amdgcn.wave.reduce.usub.i32(i32 123, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: poison_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s4, s0, -1 +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s4, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: poison_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: poison_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: poison_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: poison_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: poison_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: poison_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: poison_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX1032GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: poison_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: poison_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: poison_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: poison_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s3, s0, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.usub.i32(i32 poison, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: divergent_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8DAGISEL-NEXT: s_sub_i32 s4, s4, s6 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_mov_b32 s4, 0 +; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8GISEL-NEXT: s_sub_i32 s4, s4, s6 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9DAGISEL-NEXT: s_sub_i32 s4, s4, s6 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_mov_b32 s4, 0 +; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9GISEL-NEXT: s_sub_i32 s4, s4, s6 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064DAGISEL-NEXT: s_sub_i32 s4, s4, s6 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064GISEL-NEXT: s_sub_i32 s4, s4, s6 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032DAGISEL-NEXT: s_sub_i32 s2, s2, s5 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032GISEL-NEXT: s_sub_i32 s2, s2, s5 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_sub_i32 s4, s4, s6 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: s_sub_i32 s4, s4, s6 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_sub_i32 s2, s2, s5 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: s_sub_i32 s2, s2, s5 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.usub.i32(i32 %id.x, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: divergent_cfg: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8DAGISEL-NEXT: ; %bb.1: ; %else +; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX8DAGISEL-NEXT: ; %bb.3: ; %if +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX8DAGISEL-NEXT: s_sub_i32 s6, s6, s8 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8DAGISEL-NEXT: ; %bb.5: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_cfg: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8GISEL-NEXT: ; %bb.1: ; %else +; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX8GISEL-NEXT: s_mul_i32 s6, s3, s2 +; GFX8GISEL-NEXT: .LBB4_2: ; %Flow +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX8GISEL-NEXT: ; %bb.3: ; %if +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_mov_b32 s6, 0 +; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX8GISEL-NEXT: s_sub_i32 s6, s6, s8 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8GISEL-NEXT: .LBB4_5: ; %endif +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_cfg: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9DAGISEL-NEXT: ; %bb.1: ; %else +; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX9DAGISEL-NEXT: ; %bb.3: ; %if +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX9DAGISEL-NEXT: s_sub_i32 s6, s6, s8 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9DAGISEL-NEXT: ; %bb.5: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_cfg: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9GISEL-NEXT: ; %bb.1: ; %else +; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX9GISEL-NEXT: s_mul_i32 s6, s3, s2 +; GFX9GISEL-NEXT: .LBB4_2: ; %Flow +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX9GISEL-NEXT: ; %bb.3: ; %if +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX9GISEL-NEXT: s_sub_i32 s6, s6, s8 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9GISEL-NEXT: .LBB4_5: ; %endif +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_cfg: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1064DAGISEL-NEXT: s_sub_i32 s6, s6, s8 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064DAGISEL-NEXT: ; %bb.5: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_cfg: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064GISEL-NEXT: ; %bb.1: ; %else +; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX1064GISEL-NEXT: s_mul_i32 s6, s3, s2 +; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1064GISEL-NEXT: ; %bb.3: ; %if +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1064GISEL-NEXT: s_sub_i32 s6, s6, s8 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064GISEL-NEXT: .LBB4_5: ; %endif +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_cfg: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, -1 +; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s2 +; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1032DAGISEL-NEXT: s_sub_i32 s1, s1, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032DAGISEL-NEXT: ; %bb.5: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_cfg: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032GISEL-NEXT: ; %bb.1: ; %else +; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, -1 +; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s2 +; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1032GISEL-NEXT: ; %bb.3: ; %if +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1032GISEL-NEXT: s_sub_i32 s0, s0, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032GISEL-NEXT: .LBB4_5: ; %endif +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_cfg: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1164DAGISEL-NEXT: s_sub_i32 s6, s6, s8 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164DAGISEL-NEXT: ; %bb.5: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_cfg: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164GISEL-NEXT: ; %bb.1: ; %else +; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s3, s6, -1 +; GFX1164GISEL-NEXT: s_mul_i32 s6, s3, s2 +; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1164GISEL-NEXT: ; %bb.3: ; %if +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_mov_b32 s6, 0 +; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1164GISEL-NEXT: s_sub_i32 s6, s6, s8 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164GISEL-NEXT: .LBB4_5: ; %endif +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_cfg: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, -1 +; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s2 +; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1132DAGISEL-NEXT: s_sub_i32 s1, s1, s6 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132DAGISEL-NEXT: ; %bb.5: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_cfg: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132GISEL-NEXT: ; %bb.1: ; %else +; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, -1 +; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s2 +; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1132GISEL-NEXT: ; %bb.3: ; %if +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1132GISEL-NEXT: s_sub_i32 s0, s0, s6 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132GISEL-NEXT: .LBB4_5: ; %endif +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1132GISEL-NEXT: s_endpgm + entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.usub.i32(i32 %tid, i32 1) + br label %endif + +else: + %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.usub.i32(i32 %in, i32 1) + br label %endif + +endif: + %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else] + store i32 %combine, ptr addrspace(1) %out + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX10DAGISEL: {{.*}} +; GFX10GISEL: {{.*}} +; GFX11DAGISEL: {{.*}} +; GFX11GISEL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll new file mode 100644 index 0000000000000..6194aea667953 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll @@ -0,0 +1,1290 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s + +define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: uniform_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_clause 0x1 +; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: uniform_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_clause 0x1 +; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: uniform_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_clause 0x1 +; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032DAGISEL-NEXT: s_and_b32 s3, s3, 1 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: uniform_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_clause 0x1 +; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032GISEL-NEXT: s_and_b32 s3, s3, 1 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132DAGISEL-NEXT: s_and_b32 s3, s3, 1 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_and_b32 s3, s3, 1 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %in, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: const_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX8DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: const_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX8GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: const_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX9DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: const_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX9GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: const_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1064DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: const_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1064GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: const_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1032DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: const_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1032GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: const_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1164DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: const_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1164GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: const_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: const_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1132GISEL-NEXT: s_mulk_i32 s2, 0x7b +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm + entry: + %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 123, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: poison_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: poison_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: poison_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: poison_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: poison_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: poison_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: poison_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: poison_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: poison_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: poison_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: poison_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: poison_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s2, s0, s2 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 poison, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: divergent_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8DAGISEL-NEXT: s_xor_b32 s4, s4, s6 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_mov_b32 s4, 0 +; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8GISEL-NEXT: s_xor_b32 s4, s4, s6 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9DAGISEL-NEXT: s_xor_b32 s4, s4, s6 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_mov_b32 s4, 0 +; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9GISEL-NEXT: s_xor_b32 s4, s4, s6 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064DAGISEL-NEXT: s_xor_b32 s4, s4, s6 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064GISEL-NEXT: s_xor_b32 s4, s4, s6 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032DAGISEL-NEXT: s_xor_b32 s2, s2, s5 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1032GISEL-NEXT: s_xor_b32 s2, s2, s5 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0 +; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_xor_b32 s4, s4, s6 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: s_xor_b32 s4, s4, s6 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_xor_b32 s2, s2, s5 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: s_xor_b32 s2, s2, s5 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %id.x, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: divergent_cfg: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8DAGISEL-NEXT: ; %bb.1: ; %else +; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX8DAGISEL-NEXT: ; %bb.3: ; %if +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX8DAGISEL-NEXT: s_xor_b32 s6, s6, s8 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8DAGISEL-NEXT: ; %bb.5: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_cfg: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8GISEL-NEXT: ; %bb.1: ; %else +; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s6, s6, s2 +; GFX8GISEL-NEXT: .LBB4_2: ; %Flow +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX8GISEL-NEXT: ; %bb.3: ; %if +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_mov_b32 s6, 0 +; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX8GISEL-NEXT: s_xor_b32 s6, s6, s8 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8GISEL-NEXT: .LBB4_5: ; %endif +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_cfg: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9DAGISEL-NEXT: ; %bb.1: ; %else +; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX9DAGISEL-NEXT: ; %bb.3: ; %if +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX9DAGISEL-NEXT: s_xor_b32 s6, s6, s8 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9DAGISEL-NEXT: ; %bb.5: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_cfg: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9GISEL-NEXT: ; %bb.1: ; %else +; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s6, s6, s2 +; GFX9GISEL-NEXT: .LBB4_2: ; %Flow +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX9GISEL-NEXT: ; %bb.3: ; %if +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX9GISEL-NEXT: s_xor_b32 s6, s6, s8 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9GISEL-NEXT: .LBB4_5: ; %endif +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_cfg: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1064DAGISEL-NEXT: s_xor_b32 s6, s6, s8 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064DAGISEL-NEXT: ; %bb.5: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_cfg: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064GISEL-NEXT: ; %bb.1: ; %else +; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s2 +; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1064GISEL-NEXT: ; %bb.3: ; %if +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1064GISEL-NEXT: s_xor_b32 s6, s6, s8 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064GISEL-NEXT: .LBB4_5: ; %endif +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_cfg: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s2 +; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1032DAGISEL-NEXT: s_xor_b32 s1, s1, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032DAGISEL-NEXT: ; %bb.5: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_cfg: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032GISEL-NEXT: ; %bb.1: ; %else +; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s2 +; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1032GISEL-NEXT: ; %bb.3: ; %if +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1032GISEL-NEXT: s_xor_b32 s0, s0, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032GISEL-NEXT: .LBB4_5: ; %endif +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_cfg: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1164DAGISEL-NEXT: s_xor_b32 s6, s6, s8 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164DAGISEL-NEXT: ; %bb.5: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_cfg: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164GISEL-NEXT: ; %bb.1: ; %else +; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s6, s6, s2 +; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1164GISEL-NEXT: ; %bb.3: ; %if +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_mov_b32 s6, 0 +; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 +; GFX1164GISEL-NEXT: s_xor_b32 s6, s6, s8 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164GISEL-NEXT: .LBB4_5: ; %endif +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_cfg: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s2 +; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1132DAGISEL-NEXT: s_xor_b32 s1, s1, s6 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132DAGISEL-NEXT: ; %bb.5: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_cfg: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132GISEL-NEXT: ; %bb.1: ; %else +; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s2 +; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1132GISEL-NEXT: ; %bb.3: ; %if +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1132GISEL-NEXT: s_xor_b32 s0, s0, s6 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132GISEL-NEXT: .LBB4_5: ; %endif +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1132GISEL-NEXT: s_endpgm + entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %tid, i32 1) + br label %endif + +else: + %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %in, i32 1) + br label %endif + +endif: + %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else] + store i32 %combine, ptr addrspace(1) %out + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX10DAGISEL: {{.*}} +; GFX10GISEL: {{.*}} +; GFX11DAGISEL: {{.*}} +; GFX11GISEL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.add.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.add.mir new file mode 100644 index 0000000000000..ed9005010a71d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.add.mir @@ -0,0 +1,90 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 +# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: uniform_value +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0.entry: + liveins: $sgpr0_sgpr1 + + ; GCN-LABEL: name: uniform_value + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0 + ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; GCN-NEXT: [[S_BCNT1_I32_B64_:%[0-9]+]]:sgpr_32 = S_BCNT1_I32_B64 [[S_MOV_B64_]], implicit-def $scc + ; GCN-NEXT: [[S_MUL_I32_:%[0-9]+]]:sgpr_32 = S_MUL_I32 [[S_LOAD_DWORD_IMM]], [[S_BCNT1_I32_B64_]] + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MUL_I32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0 + %7:sgpr_32 = WAVE_REDUCE_ADD_PSEUDO_I32 killed %6, 1, implicit $exec + %8:vgpr_32 = COPY %7 + GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec + S_ENDPGM 0 + +... + +--- +name: divergent_value +machineFunctionInfo: + isEntryFunction: true +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: divergent_value + ; GCN: bb.0.entry: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2 + ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2 + ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]] + ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]] + ; GCN-NEXT: [[S_ADD_I32_:%[0-9]+]]:sgpr_32 = S_ADD_I32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc + ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]] + ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3 + ; GCN-NEXT: S_ENDPGM 0 + bb.0.entry: + liveins: $vgpr0, $sgpr0_sgpr1 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %6:sgpr_32 = WAVE_REDUCE_ADD_PSEUDO_I32 %0, 1, implicit $exec + %7:vgpr_32 = COPY %6 + GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec + bb.1: + %8:vgpr_32 = PHI %0, %bb.0 + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.and.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.and.mir new file mode 100644 index 0000000000000..8ab1905295a8d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.and.mir @@ -0,0 +1,89 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 +# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -passes=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: uniform_value +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0.entry: + liveins: $sgpr0_sgpr1 + + ; GCN-LABEL: name: uniform_value + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0 + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 [[S_LOAD_DWORD_IMM]] + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0 + %7:sgpr_32 = WAVE_REDUCE_AND_PSEUDO_B32 killed %6, 1, implicit $exec + %8:vgpr_32 = COPY %7 + GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec + S_ENDPGM 0 + +... + +--- +name: divergent_value +machineFunctionInfo: + isEntryFunction: true +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: divergent_value + ; GCN: bb.0.entry: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 4294967295 + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2 + ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2 + ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]] + ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]] + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc + ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]] + ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3 + ; GCN-NEXT: S_ENDPGM 0 + bb.0.entry: + liveins: $vgpr0, $sgpr0_sgpr1 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %6:sgpr_32 = WAVE_REDUCE_AND_PSEUDO_B32 %0, 1, implicit $exec + %7:vgpr_32 = COPY %6 + GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec + bb.1: + %8:vgpr_32 = PHI %0, %bb.0 + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.max.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.max.mir new file mode 100644 index 0000000000000..a5f8b0dbe52b2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.max.mir @@ -0,0 +1,89 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 +# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -passes=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: uniform_value +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0.entry: + liveins: $sgpr0_sgpr1 + + ; GCN-LABEL: name: uniform_value + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0 + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 [[S_LOAD_DWORD_IMM]] + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0 + %7:sgpr_32 = WAVE_REDUCE_MAX_PSEUDO_I32 killed %6, 1, implicit $exec + %8:vgpr_32 = COPY %7 + GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec + S_ENDPGM 0 + +... + +--- +name: divergent_value +machineFunctionInfo: + isEntryFunction: true +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: divergent_value + ; GCN: bb.0.entry: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 2147483648 + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2 + ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2 + ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]] + ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]] + ; GCN-NEXT: [[S_MAX_I32_:%[0-9]+]]:sgpr_32 = S_MAX_I32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc + ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]] + ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MAX_I32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3 + ; GCN-NEXT: S_ENDPGM 0 + bb.0.entry: + liveins: $vgpr0, $sgpr0_sgpr1 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %6:sgpr_32 = WAVE_REDUCE_MAX_PSEUDO_I32 %0, 1, implicit $exec + %7:vgpr_32 = COPY %6 + GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec + bb.1: + %8:vgpr_32 = PHI %0, %bb.0 + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.min.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.min.mir new file mode 100644 index 0000000000000..7a0a522d1d3dd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.min.mir @@ -0,0 +1,89 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 +# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -passes=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: uniform_value +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0.entry: + liveins: $sgpr0_sgpr1 + + ; GCN-LABEL: name: uniform_value + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0 + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 [[S_LOAD_DWORD_IMM]] + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0 + %7:sgpr_32 = WAVE_REDUCE_MIN_PSEUDO_I32 killed %6, 1, implicit $exec + %8:vgpr_32 = COPY %7 + GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec + S_ENDPGM 0 + +... + +--- +name: divergent_value +machineFunctionInfo: + isEntryFunction: true +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: divergent_value + ; GCN: bb.0.entry: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 2147483647 + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2 + ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2 + ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]] + ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]] + ; GCN-NEXT: [[S_MIN_I32_:%[0-9]+]]:sgpr_32 = S_MIN_I32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc + ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]] + ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MIN_I32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3 + ; GCN-NEXT: S_ENDPGM 0 + bb.0.entry: + liveins: $vgpr0, $sgpr0_sgpr1 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %6:sgpr_32 = WAVE_REDUCE_MIN_PSEUDO_I32 %0, 1, implicit $exec + %7:vgpr_32 = COPY %6 + GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec + bb.1: + %8:vgpr_32 = PHI %0, %bb.0 + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.or.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.or.mir new file mode 100644 index 0000000000000..4711240af3ecf --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.or.mir @@ -0,0 +1,89 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 +# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -passes=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: uniform_value +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0.entry: + liveins: $sgpr0_sgpr1 + + ; GCN-LABEL: name: uniform_value + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0 + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 [[S_LOAD_DWORD_IMM]] + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0 + %7:sgpr_32 = WAVE_REDUCE_OR_PSEUDO_B32 killed %6, 1, implicit $exec + %8:vgpr_32 = COPY %7 + GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec + S_ENDPGM 0 + +... + +--- +name: divergent_value +machineFunctionInfo: + isEntryFunction: true +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: divergent_value + ; GCN: bb.0.entry: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2 + ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2 + ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]] + ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]] + ; GCN-NEXT: [[S_OR_B32_:%[0-9]+]]:sgpr_32 = S_OR_B32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc + ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]] + ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3 + ; GCN-NEXT: S_ENDPGM 0 + bb.0.entry: + liveins: $vgpr0, $sgpr0_sgpr1 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %6:sgpr_32 = WAVE_REDUCE_OR_PSEUDO_B32 %0, 1, implicit $exec + %7:vgpr_32 = COPY %6 + GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec + bb.1: + %8:vgpr_32 = PHI %0, %bb.0 + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.sub.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.sub.mir new file mode 100644 index 0000000000000..7b729baaaf21f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.sub.mir @@ -0,0 +1,92 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 +# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -passes=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: uniform_value +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0.entry: + liveins: $sgpr0_sgpr1 + + ; GCN-LABEL: name: uniform_value + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0 + ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; GCN-NEXT: [[S_BCNT1_I32_B64_:%[0-9]+]]:sgpr_32 = S_BCNT1_I32_B64 [[S_MOV_B64_]], implicit-def $scc + ; GCN-NEXT: [[S_MUL_I32_:%[0-9]+]]:sgpr_32 = S_MUL_I32 -1, [[S_LOAD_DWORD_IMM]] + ; GCN-NEXT: [[S_MUL_I32_1:%[0-9]+]]:sgpr_32 = S_MUL_I32 [[S_MUL_I32_]], [[S_BCNT1_I32_B64_]] + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MUL_I32_1]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0 + %7:sgpr_32 = WAVE_REDUCE_SUB_PSEUDO_I32 killed %6, 1, implicit $exec + %8:vgpr_32 = COPY %7 + GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec + S_ENDPGM 0 + +... + +--- +name: divergent_value +machineFunctionInfo: + isEntryFunction: true +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: divergent_value + ; GCN: bb.0.entry: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2 + ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2 + ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]] + ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]] + ; GCN-NEXT: [[S_SUB_I32_:%[0-9]+]]:sgpr_32 = S_SUB_I32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc + ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]] + ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_SUB_I32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3 + ; GCN-NEXT: S_ENDPGM 0 + bb.0.entry: + liveins: $vgpr0, $sgpr0_sgpr1 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %6:sgpr_32 = WAVE_REDUCE_SUB_PSEUDO_I32 %0, 1, implicit $exec + %7:vgpr_32 = COPY %6 + GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec + bb.1: + %8:vgpr_32 = PHI %0, %bb.0 + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir index 179c9f4f8dc4d..4899d9ee08d80 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 -# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -passes=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -passes=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s --- name: uniform_value diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir index 88c35a6417d23..ab45e03508182 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 -# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -passes=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -passes=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s --- name: uniform_value diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.xor.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.xor.mir new file mode 100644 index 0000000000000..3dbf0c063af92 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.xor.mir @@ -0,0 +1,92 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 +# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -passes=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: uniform_value +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0.entry: + liveins: $sgpr0_sgpr1 + + ; GCN-LABEL: name: uniform_value + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0 + ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; GCN-NEXT: [[S_BCNT1_I32_B64_:%[0-9]+]]:sgpr_32 = S_BCNT1_I32_B64 [[S_MOV_B64_]], implicit-def $scc + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 [[S_BCNT1_I32_B64_]], 1, implicit-def $scc + ; GCN-NEXT: [[S_MUL_I32_:%[0-9]+]]:sgpr_32 = S_MUL_I32 [[S_LOAD_DWORD_IMM]], [[S_AND_B32_]] + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MUL_I32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0 + %7:sgpr_32 = WAVE_REDUCE_XOR_PSEUDO_B32 killed %6, 1, implicit $exec + %8:vgpr_32 = COPY %7 + GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec + S_ENDPGM 0 + +... + +--- +name: divergent_value +machineFunctionInfo: + isEntryFunction: true +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: divergent_value + ; GCN: bb.0.entry: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2 + ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2 + ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]] + ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]] + ; GCN-NEXT: [[S_XOR_B32_:%[0-9]+]]:sgpr_32 = S_XOR_B32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc + ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]] + ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_XOR_B32_]] + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3 + ; GCN-NEXT: S_ENDPGM 0 + bb.0.entry: + liveins: $vgpr0, $sgpr0_sgpr1 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0 + %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %6:sgpr_32 = WAVE_REDUCE_XOR_PSEUDO_B32 %0, 1, implicit $exec + %7:vgpr_32 = COPY %6 + GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec + bb.1: + %8:vgpr_32 = PHI %0, %bb.0 + S_ENDPGM 0 + +... From 48783dc1dfd5e2f4f1ed559ef9d73036fe6ae837 Mon Sep 17 00:00:00 2001 From: easyonaadit Date: Wed, 12 Feb 2025 17:21:29 +0530 Subject: [PATCH 2/5] Renamed function to getIdentityValueForWaveReduction --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 2f239ddecc793..6711809bf0d31 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4940,7 +4940,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, return LoopBB; } -static uint32_t getInitialValueForWaveReduction(unsigned Opc) { +static uint32_t getIdentityValueForWaveReduction(unsigned Opc) { switch (Opc) { case AMDGPU::S_MIN_U32: return std::numeric_limits::max(); @@ -4958,7 +4958,7 @@ static uint32_t getInitialValueForWaveReduction(unsigned Opc) { case AMDGPU::S_AND_B32: return std::numeric_limits::max(); default: - llvm_unreachable("Unexpected opcode in getInitialValueForWaveReduction"); + llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction"); } } @@ -5085,7 +5085,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, // Create initail values of induction variable from Exec, Accumulator and // insert branch instr to newly created ComputeBlock - uint32_t InitalValue = getInitialValueForWaveReduction(Opc); + uint32_t InitalValue = getIdentityValueForWaveReduction(Opc); auto TmpSReg = BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg); BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg) From 38642e44f9ebd099b48304d72458eae6b7053066 Mon Sep 17 00:00:00 2001 From: easyonaadit Date: Thu, 13 Feb 2025 10:41:30 +0530 Subject: [PATCH 3/5] [AMDGPU] Add builtins for wave reduction intrinsics Currently there are no plans to push these into a public header. Initial use is for testing purposes. --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 18 +++++ clang/lib/CodeGen/CGBuiltin.cpp | 53 ++++++++++++++ clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 77 ++++++++++++++++++++ 3 files changed, 148 insertions(+) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 39e295aced96b..2018f41f1007f 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -346,6 +346,24 @@ BUILTIN(__builtin_amdgcn_endpgm, "v", "nr") BUILTIN(__builtin_amdgcn_get_fpenv, "WUi", "n") BUILTIN(__builtin_amdgcn_set_fpenv, "vWUi", "n") +//===----------------------------------------------------------------------===// + +// Wave Reduction builtins. + +//===----------------------------------------------------------------------===// + +BUILTIN(__builtin_amdgcn_wave_reduce_add_i32, "ii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_uadd_i32, "ii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_sub_i32, "ii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_usub_i32, "ii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_min_i32, "ii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_umin_i32, "ii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_max_i32, "ii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_umax_i32, "ii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_and_i32, "ii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_or_i32, "ii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_xor_i32, "ii", "nc") + //===----------------------------------------------------------------------===// // R600-NI only builtins. //===----------------------------------------------------------------------===// diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 361e4c4bf2e2e..1cd4fe6a974a0 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -20212,6 +20212,59 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::Value *Env = EmitScalarExpr(E->getArg(0)); return Builder.CreateCall(F, {Env}); } + case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_i32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_uadd_i32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_i32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_usub_i32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_umin_i32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_umax_i32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_i32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_i32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_i32: { + Intrinsic::ID IID; + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_i32: + IID = Intrinsic::amdgcn_wave_reduce_add; + break; + case AMDGPU::BI__builtin_amdgcn_wave_reduce_uadd_i32: + IID = Intrinsic::amdgcn_wave_reduce_uadd; + break; + case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_i32: + IID = Intrinsic::amdgcn_wave_reduce_sub; + break; + case AMDGPU::BI__builtin_amdgcn_wave_reduce_usub_i32: + IID = Intrinsic::amdgcn_wave_reduce_usub; + break; + case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32: + IID = Intrinsic::amdgcn_wave_reduce_min; + break; + case AMDGPU::BI__builtin_amdgcn_wave_reduce_umin_i32: + IID = Intrinsic::amdgcn_wave_reduce_umin; + break; + case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32: + IID = Intrinsic::amdgcn_wave_reduce_max; + break; + case AMDGPU::BI__builtin_amdgcn_wave_reduce_umax_i32: + IID = Intrinsic::amdgcn_wave_reduce_umax; + break; + case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_i32: + IID = Intrinsic::amdgcn_wave_reduce_and; + break; + case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_i32: + IID = Intrinsic::amdgcn_wave_reduce_or; + break; + case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_i32: + IID = Intrinsic::amdgcn_wave_reduce_xor; + break; + } + llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); + llvm::Function *F = CGM.getIntrinsic(IID, {Src0->getType()}); + llvm::Value *Strategy = + llvm::ConstantInt::get(llvm::Type::getInt32Ty(getLLVMContext()), 0); + return Builder.CreateCall(F, {Src0, Strategy}); + } case AMDGPU::BI__builtin_amdgcn_read_exec: return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false); case AMDGPU::BI__builtin_amdgcn_read_exec_lo: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index ded5f6b5ac4fd..94a7b5ff9bbb2 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -398,6 +398,83 @@ void test_s_sendmsghalt_var(int in) __builtin_amdgcn_s_sendmsghalt(1, in); } +// CHECK-LABEL: @test_wave_reduce_add_i32 +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32( +void test_wave_reduce_add_i32(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_add_i32(in); +} + +// CHECK-LABEL: @test_wave_reduce_uadd_i32 +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.uadd.i32( +void test_wave_reduce_uadd_i32(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_uadd_i32(in); +} + +// CHECK-LABEL: @test_wave_reduce_sub_i32 +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32( +void test_wave_reduce_sub_i32(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_sub_i32(in); +} + +// CHECK-LABEL: @test_wave_reduce_usub_i32 +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.usub.i32( +void test_wave_reduce_usub_i32(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_usub_i32(in); +} + +// CHECK-LABEL: @test_wave_reduce_min_i32 +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32( +void test_wave_reduce_min_i32(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_min_i32(in); +} + +// CHECK-LABEL: @test_wave_reduce_umin_i32 +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32( +void test_wave_reduce_umin_i32(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_umin_i32(in); +} + +// CHECK-LABEL: @test_wave_reduce_max_i32 +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32( +void test_wave_reduce_max_i32(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_max_i32(in); +} + +// CHECK-LABEL: @test_wave_reduce_umax_i32 +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32( +void test_wave_reduce_umax_i32(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_umax_i32(in); +} + +// CHECK-LABEL: @test_wave_reduce_and_i32 +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32( +void test_wave_reduce_and_i32(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_and_i32(in); +} + +// CHECK-LABEL: @test_wave_reduce_or_i32 +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.or.i32( +void test_wave_reduce_or_i32(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_or_i32(in); +} + +// CHECK-LABEL: @test_wave_reduce_xor_i32 +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.xor.i32( +void test_wave_reduce_xor_i32(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_xor_i32(in); +} + // CHECK-LABEL: @test_s_barrier // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.barrier( void test_s_barrier() From ad1dd4ecacae803581b8c6614681a1826051e5ac Mon Sep 17 00:00:00 2001 From: easyonaadit Date: Wed, 19 Feb 2025 14:37:52 +0530 Subject: [PATCH 4/5] add strategy argument in builtin --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 22 ++--- clang/lib/CodeGen/CGBuiltin.cpp | 89 +++++++++----------- clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 22 ++--- 3 files changed, 63 insertions(+), 70 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 2018f41f1007f..af0491b835233 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -352,17 +352,17 @@ BUILTIN(__builtin_amdgcn_set_fpenv, "vWUi", "n") //===----------------------------------------------------------------------===// -BUILTIN(__builtin_amdgcn_wave_reduce_add_i32, "ii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_uadd_i32, "ii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_sub_i32, "ii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_usub_i32, "ii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_min_i32, "ii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_umin_i32, "ii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_max_i32, "ii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_umax_i32, "ii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_and_i32, "ii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_or_i32, "ii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_xor_i32, "ii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_add_i32, "iii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_add_u32, "UiUii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_sub_i32, "iii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_sub_u32, "UiUii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_min_i32, "iii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_min_u32, "UiUii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_max_i32, "iii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_max_u32, "UiUii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_and_b32, "iii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_or_b32, "iii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_xor_b32, "iii", "nc") //===----------------------------------------------------------------------===// // R600-NI only builtins. diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 1cd4fe6a974a0..bf7d40939dce3 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -19911,6 +19911,35 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst, Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs)); } +static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) { + switch (BuiltinID) { + default: + llvm_unreachable("Unknown BuiltinID for wave reduction"); + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_i32: + return Intrinsic::amdgcn_wave_reduce_add; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32: + return Intrinsic::amdgcn_wave_reduce_uadd; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_i32: + return Intrinsic::amdgcn_wave_reduce_sub; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32: + return Intrinsic::amdgcn_wave_reduce_usub; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32: + return Intrinsic::amdgcn_wave_reduce_min; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32: + return Intrinsic::amdgcn_wave_reduce_umin; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32: + return Intrinsic::amdgcn_wave_reduce_max; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32: + return Intrinsic::amdgcn_wave_reduce_umax; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32: + return Intrinsic::amdgcn_wave_reduce_and; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32: + return Intrinsic::amdgcn_wave_reduce_or; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32: + return Intrinsic::amdgcn_wave_reduce_xor; + } +} + Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent; @@ -20213,57 +20242,21 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, return Builder.CreateCall(F, {Env}); } case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_i32: - case AMDGPU::BI__builtin_amdgcn_wave_reduce_uadd_i32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_i32: - case AMDGPU::BI__builtin_amdgcn_wave_reduce_usub_i32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32: - case AMDGPU::BI__builtin_amdgcn_wave_reduce_umin_i32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32: - case AMDGPU::BI__builtin_amdgcn_wave_reduce_umax_i32: - case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_i32: - case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_i32: - case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_i32: { - Intrinsic::ID IID; - switch (BuiltinID) { - case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_i32: - IID = Intrinsic::amdgcn_wave_reduce_add; - break; - case AMDGPU::BI__builtin_amdgcn_wave_reduce_uadd_i32: - IID = Intrinsic::amdgcn_wave_reduce_uadd; - break; - case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_i32: - IID = Intrinsic::amdgcn_wave_reduce_sub; - break; - case AMDGPU::BI__builtin_amdgcn_wave_reduce_usub_i32: - IID = Intrinsic::amdgcn_wave_reduce_usub; - break; - case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32: - IID = Intrinsic::amdgcn_wave_reduce_min; - break; - case AMDGPU::BI__builtin_amdgcn_wave_reduce_umin_i32: - IID = Intrinsic::amdgcn_wave_reduce_umin; - break; - case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32: - IID = Intrinsic::amdgcn_wave_reduce_max; - break; - case AMDGPU::BI__builtin_amdgcn_wave_reduce_umax_i32: - IID = Intrinsic::amdgcn_wave_reduce_umax; - break; - case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_i32: - IID = Intrinsic::amdgcn_wave_reduce_and; - break; - case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_i32: - IID = Intrinsic::amdgcn_wave_reduce_or; - break; - case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_i32: - IID = Intrinsic::amdgcn_wave_reduce_xor; - break; - } - llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); - llvm::Function *F = CGM.getIntrinsic(IID, {Src0->getType()}); - llvm::Value *Strategy = - llvm::ConstantInt::get(llvm::Type::getInt32Ty(getLLVMContext()), 0); - return Builder.CreateCall(F, {Src0, Strategy}); + case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32: { + Intrinsic::ID IID = getIntrinsicIDforWaveReduction(BuiltinID); + llvm::Value *Value = EmitScalarExpr(E->getArg(0)); + llvm::Value *Strategy = EmitScalarExpr(E->getArg(1)); + llvm::Function *F = CGM.getIntrinsic(IID, {Value->getType()}); + return Builder.CreateCall(F, {Value, Strategy}); } case AMDGPU::BI__builtin_amdgcn_read_exec: return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false); diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index 94a7b5ff9bbb2..9ac02580f8e44 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -402,77 +402,77 @@ void test_s_sendmsghalt_var(int in) // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32( void test_wave_reduce_add_i32(global int* out, int in) { - *out = __builtin_amdgcn_wave_reduce_add_i32(in); + *out = __builtin_amdgcn_wave_reduce_add_i32(in, 0); } // CHECK-LABEL: @test_wave_reduce_uadd_i32 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.uadd.i32( void test_wave_reduce_uadd_i32(global int* out, int in) { - *out = __builtin_amdgcn_wave_reduce_uadd_i32(in); + *out = __builtin_amdgcn_wave_reduce_add_u32(in, 0); } // CHECK-LABEL: @test_wave_reduce_sub_i32 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32( void test_wave_reduce_sub_i32(global int* out, int in) { - *out = __builtin_amdgcn_wave_reduce_sub_i32(in); + *out = __builtin_amdgcn_wave_reduce_sub_i32(in, 0); } // CHECK-LABEL: @test_wave_reduce_usub_i32 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.usub.i32( void test_wave_reduce_usub_i32(global int* out, int in) { - *out = __builtin_amdgcn_wave_reduce_usub_i32(in); + *out = __builtin_amdgcn_wave_reduce_sub_u32(in, 0); } // CHECK-LABEL: @test_wave_reduce_min_i32 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32( void test_wave_reduce_min_i32(global int* out, int in) { - *out = __builtin_amdgcn_wave_reduce_min_i32(in); + *out = __builtin_amdgcn_wave_reduce_min_i32(in, 0); } // CHECK-LABEL: @test_wave_reduce_umin_i32 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32( void test_wave_reduce_umin_i32(global int* out, int in) { - *out = __builtin_amdgcn_wave_reduce_umin_i32(in); + *out = __builtin_amdgcn_wave_reduce_min_u32(in, 0); } // CHECK-LABEL: @test_wave_reduce_max_i32 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32( void test_wave_reduce_max_i32(global int* out, int in) { - *out = __builtin_amdgcn_wave_reduce_max_i32(in); + *out = __builtin_amdgcn_wave_reduce_max_i32(in, 0); } // CHECK-LABEL: @test_wave_reduce_umax_i32 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32( void test_wave_reduce_umax_i32(global int* out, int in) { - *out = __builtin_amdgcn_wave_reduce_umax_i32(in); + *out = __builtin_amdgcn_wave_reduce_max_u32(in, 0); } // CHECK-LABEL: @test_wave_reduce_and_i32 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32( void test_wave_reduce_and_i32(global int* out, int in) { - *out = __builtin_amdgcn_wave_reduce_and_i32(in); + *out = __builtin_amdgcn_wave_reduce_and_b32(in, 0); } // CHECK-LABEL: @test_wave_reduce_or_i32 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.or.i32( void test_wave_reduce_or_i32(global int* out, int in) { - *out = __builtin_amdgcn_wave_reduce_or_i32(in); + *out = __builtin_amdgcn_wave_reduce_or_b32(in, 0); } // CHECK-LABEL: @test_wave_reduce_xor_i32 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.xor.i32( void test_wave_reduce_xor_i32(global int* out, int in) { - *out = __builtin_amdgcn_wave_reduce_xor_i32(in); + *out = __builtin_amdgcn_wave_reduce_xor_b32(in, 0); } // CHECK-LABEL: @test_s_barrier From 054aa3994c6eb1216ad228794a6eafa72bad3b7e Mon Sep 17 00:00:00 2001 From: easyonaadit Date: Wed, 19 Feb 2025 14:55:23 +0530 Subject: [PATCH 5/5] test for stratergy --- clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 198 +++++++++++++++++--- 1 file changed, 176 insertions(+), 22 deletions(-) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index 9ac02580f8e44..6183b2dde2b07 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -398,83 +398,237 @@ void test_s_sendmsghalt_var(int in) __builtin_amdgcn_s_sendmsghalt(1, in); } -// CHECK-LABEL: @test_wave_reduce_add_i32 +// CHECK-LABEL: @test_wave_reduce_add_i32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32( -void test_wave_reduce_add_i32(global int* out, int in) +void test_wave_reduce_add_i32_default(global int* out, int in) { *out = __builtin_amdgcn_wave_reduce_add_i32(in, 0); } -// CHECK-LABEL: @test_wave_reduce_uadd_i32 +// CHECK-LABEL: @test_wave_reduce_add_i32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32( +void test_wave_reduce_add_i32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_add_i32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_add_i32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32( +void test_wave_reduce_add_i32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_add_i32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_uadd_i32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.uadd.i32( -void test_wave_reduce_uadd_i32(global int* out, int in) +void test_wave_reduce_uadd_i32_default(global int* out, int in) { *out = __builtin_amdgcn_wave_reduce_add_u32(in, 0); } -// CHECK-LABEL: @test_wave_reduce_sub_i32 +// CHECK-LABEL: @test_wave_reduce_uadd_i32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.uadd.i32( +void test_wave_reduce_uadd_i32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_add_u32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_uadd_i32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.uadd.i32( +void test_wave_reduce_uadd_i32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_add_u32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_sub_i32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32( -void test_wave_reduce_sub_i32(global int* out, int in) +void test_wave_reduce_sub_i32_default(global int* out, int in) { *out = __builtin_amdgcn_wave_reduce_sub_i32(in, 0); } -// CHECK-LABEL: @test_wave_reduce_usub_i32 +// CHECK-LABEL: @test_wave_reduce_sub_i32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32( +void test_wave_reduce_sub_i32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_sub_i32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_sub_i32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32( +void test_wave_reduce_sub_i32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_sub_i32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_usub_i32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.usub.i32( -void test_wave_reduce_usub_i32(global int* out, int in) +void test_wave_reduce_usub_i32_default(global int* out, int in) { *out = __builtin_amdgcn_wave_reduce_sub_u32(in, 0); } -// CHECK-LABEL: @test_wave_reduce_min_i32 +// CHECK-LABEL: @test_wave_reduce_usub_i32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.usub.i32( +void test_wave_reduce_usub_i32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_sub_u32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_usub_i32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.usub.i32( +void test_wave_reduce_usub_i32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_sub_u32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_min_i32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32( -void test_wave_reduce_min_i32(global int* out, int in) +void test_wave_reduce_min_i32_default(global int* out, int in) { *out = __builtin_amdgcn_wave_reduce_min_i32(in, 0); } -// CHECK-LABEL: @test_wave_reduce_umin_i32 +// CHECK-LABEL: @test_wave_reduce_min_i32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32( +void test_wave_reduce_min_i32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_min_i32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_min_i32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32( +void test_wave_reduce_min_i32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_min_i32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_umin_i32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32( -void test_wave_reduce_umin_i32(global int* out, int in) +void test_wave_reduce_umin_i32_default(global int* out, int in) { *out = __builtin_amdgcn_wave_reduce_min_u32(in, 0); } -// CHECK-LABEL: @test_wave_reduce_max_i32 +// CHECK-LABEL: @test_wave_reduce_umin_i32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32( +void test_wave_reduce_umin_i32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_min_u32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_umin_i32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32( +void test_wave_reduce_umin_i32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_min_u32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_max_i32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32( -void test_wave_reduce_max_i32(global int* out, int in) +void test_wave_reduce_max_i32_default(global int* out, int in) { *out = __builtin_amdgcn_wave_reduce_max_i32(in, 0); } -// CHECK-LABEL: @test_wave_reduce_umax_i32 +// CHECK-LABEL: @test_wave_reduce_max_i32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32( +void test_wave_reduce_max_i32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_max_i32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_max_i32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32( +void test_wave_reduce_max_i32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_max_i32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_umax_i32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32( -void test_wave_reduce_umax_i32(global int* out, int in) +void test_wave_reduce_umax_i32_default(global int* out, int in) { *out = __builtin_amdgcn_wave_reduce_max_u32(in, 0); } -// CHECK-LABEL: @test_wave_reduce_and_i32 +// CHECK-LABEL: @test_wave_reduce_umax_i32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32( +void test_wave_reduce_umax_i32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_max_u32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_umax_i32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32( +void test_wave_reduce_umax_i32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_max_u32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_and_i32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32( -void test_wave_reduce_and_i32(global int* out, int in) +void test_wave_reduce_and_i32_default(global int* out, int in) { *out = __builtin_amdgcn_wave_reduce_and_b32(in, 0); } -// CHECK-LABEL: @test_wave_reduce_or_i32 +// CHECK-LABEL: @test_wave_reduce_and_i32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32( +void test_wave_reduce_and_i32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_and_b32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_and_i32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32( +void test_wave_reduce_and_i32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_and_b32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_or_i32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.or.i32( -void test_wave_reduce_or_i32(global int* out, int in) +void test_wave_reduce_or_i32_default(global int* out, int in) { *out = __builtin_amdgcn_wave_reduce_or_b32(in, 0); } -// CHECK-LABEL: @test_wave_reduce_xor_i32 +// CHECK-LABEL: @test_wave_reduce_or_i32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.or.i32( +void test_wave_reduce_or_i32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_or_b32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_or_i32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.or.i32( +void test_wave_reduce_or_i32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_or_b32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_xor_i32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.xor.i32( -void test_wave_reduce_xor_i32(global int* out, int in) +void test_wave_reduce_xor_i32_default(global int* out, int in) { *out = __builtin_amdgcn_wave_reduce_xor_b32(in, 0); } +// CHECK-LABEL: @test_wave_reduce_xor_i32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.xor.i32( +void test_wave_reduce_xor_i32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_xor_b32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_xor_i32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.xor.i32( +void test_wave_reduce_xor_i32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_xor_b32(in, 2); +} + // CHECK-LABEL: @test_s_barrier // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.barrier( void test_s_barrier()