diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a212a9218ca0d..ea51fa44f32fe 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3997,10 +3997,11 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, InVals, /*IsThisReturn=*/false, SDValue()); } -// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC, -// except for applying the wave size scale to the increment amount. -SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op, - SelectionDAG &DAG) const { +SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + // This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC, + // except for applying the wave size scale to the increment amount and doing a + // wave reduction for divergent allocation size. const MachineFunction &MF = DAG.getMachineFunction(); const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); @@ -4018,6 +4019,8 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op, Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); SDValue Size = Tmp2.getOperand(1); + + // Start address of the dynamically sized stack object SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue(); @@ -4027,12 +4030,28 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op, ? 
ISD::ADD : ISD::SUB; - SDValue ScaledSize = DAG.getNode( - ISD::SHL, dl, VT, Size, - DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32)); + if (isa<ConstantSDNode>(Op.getOperand(1))){ + SDValue ScaledSize = DAG.getNode( + ISD::SHL, dl, VT, Size, + DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32)); + Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value + } + else{ + SDValue WaveReduction = + DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32); + Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, + WaveReduction, Size, DAG.getConstant(0, dl, MVT::i32)); + SDValue ScaledSize = DAG.getNode( + ISD::SHL, dl, VT, Size, + DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32)); + Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value in vgpr. + SDValue ReadFirstLaneID = + DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32); + Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, + ReadFirstLaneID, Tmp1); + } Align StackAlign = TFL->getStackAlign(); - Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value if (Alignment && *Alignment > StackAlign) { Tmp1 = DAG.getNode( ISD::AND, dl, VT, Tmp1, @@ -4042,25 +4061,12 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op, } Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain + Tmp1 = SP; Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl); return DAG.getMergeValues({Tmp1, Tmp2}, dl); } -SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, - SelectionDAG &DAG) const { - // We only handle constant sizes here to allow non-entry block, static sized - // allocas. A truly dynamic value is more difficult to support because we - // don't know if the size value is uniform or not. If the size isn't uniform, - // we would need to do a wave reduction to get the maximum size to know how - // much to increment the uniform stack pointer. 
- SDValue Size = Op.getOperand(1); - if (isa<ConstantSDNode>(Size)) - return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion. - - return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG); -} - SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType() != MVT::i32) return Op; // Defer to cannot select error. diff --git a/llvm/test/CodeGen/AMDGPU/dynamic-alloca.ll b/llvm/test/CodeGen/AMDGPU/dynamic-alloca.ll new file mode 100644 index 0000000000000..4a4915ebf4e9d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/dynamic-alloca.ll @@ -0,0 +1,2141 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -march=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10,GFX1064 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10,GFX1032 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11,GFX1164 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11,GFX1132 %s + +define amdgpu_kernel void @constant_value() { +; GFX8DAGISEL-LABEL: constant_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX8DAGISEL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8DAGISEL-NEXT: s_mov_b32 s14, -1 +; GFX8DAGISEL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX8DAGISEL-NEXT: s_add_u32 s12, s12, s11 +; GFX8DAGISEL-NEXT: s_addc_u32 s13, s13, 0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX8DAGISEL-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: s_endpgm +; GFX8-LABEL: constant_value: +; GFX8: ; %bb.0: ; %entry +; 
GFX8-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s90, -1 +; GFX8-NEXT: s_mov_b32 s91, 0xe80000 +; GFX8-NEXT: s_add_u32 s88, s88, s11 +; GFX8-NEXT: s_addc_u32 s89, s89, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX8-NEXT: buffer_store_dword v0, off, s[88:91], 0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: constant_value: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s11 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: constant_value: +; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s14, -1 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s12, s12, s11 +; GFX1064-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: constant_value: +; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s12, s12, s11 +; GFX1032-NEXT: s_addc_u32 s13, s13, 0 +; GFX1032-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: s_endpgm +; +; GFX11-LABEL: constant_value: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX11-NEXT: scratch_store_b32 off, v0, off dlc +; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm +entry: + %n = add i32 5, 0 + %dyn_alloca = alloca i32, i32 %n, addrspace(5) + store volatile i32 123, ptr addrspace(5) %dyn_alloca + ret void +} + +define amdgpu_kernel void @uniform_value(i32 %n) { +; GFX8DAGISEL-LABEL: uniform_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX8DAGISEL-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8DAGISEL-NEXT: s_mov_b32 s14, -1 +; GFX8DAGISEL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX8DAGISEL-NEXT: s_add_u32 s12, s12, s11 +; GFX8DAGISEL-NEXT: s_addc_u32 s13, s13, 0 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX8DAGISEL-NEXT: s_add_i32 s0, s0, 15 +; GFX8DAGISEL-NEXT: s_movk_i32 s32, 0x400 +; GFX8DAGISEL-NEXT: s_and_b32 s0, s0, -16 +; GFX8DAGISEL-NEXT: s_mov_b32 s1, s32 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX8DAGISEL-NEXT: s_lshl_b32 s0, s0, 6 +; GFX8DAGISEL-NEXT: s_mov_b32 s33, 0 +; GFX8DAGISEL-NEXT: s_add_i32 s32, s1, s0 +; GFX8DAGISEL-NEXT: buffer_store_dword v0, off, s[12:15], s1 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: s_endpgm +; GFX8-LABEL: uniform_value: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX8-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s90, -1 +; GFX8-NEXT: s_mov_b32 s91, 0xe80000 +; GFX8-NEXT: s_add_u32 s88, s88, s11 +; GFX8-NEXT: s_addc_u32 s89, s89, 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b32 s0, s0, 2 +; GFX8-NEXT: s_add_i32 s0, s0, 15 +; GFX8-NEXT: s_movk_i32 s32, 0x400 +; GFX8-NEXT: s_and_b32 s0, s0, -16 +; GFX8-NEXT: s_mov_b32 s1, s32 +; GFX8-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX8-NEXT: s_lshl_b32 s0, s0, 6 +; GFX8-NEXT: s_mov_b32 s33, 0 +; GFX8-NEXT: s_add_i32 s32, s1, s0 +; GFX8-NEXT: buffer_store_dword v0, off, s[88:91], s1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; 
GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: uniform_value: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s11 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_add_i32 s0, s0, 15 +; GFX9-NEXT: s_movk_i32 s32, 0x400 +; GFX9-NEXT: s_and_b32 s0, s0, -16 +; GFX9-NEXT: s_mov_b32 s1, s32 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: s_lshl_b32 s0, s0, 6 +; GFX9-NEXT: s_mov_b32 s33, 0 +; GFX9-NEXT: s_add_i32 s32, s1, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[12:15], s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: uniform_value: +; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s14, -1 +; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s12, s12, s11 +; GFX1064-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX1064-NEXT: s_movk_i32 s32, 0x400 +; GFX1064-NEXT: s_mov_b32 s33, 0 +; GFX1064-NEXT: s_mov_b32 s1, s32 +; GFX1064-NEXT: buffer_store_dword v0, off, s[12:15], s1 +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_lshl_b32 s0, s0, 2 +; GFX1064-NEXT: s_add_i32 s0, s0, 15 +; GFX1064-NEXT: s_and_b32 s0, s0, -16 +; GFX1064-NEXT: s_lshl_b32 s0, s0, 6 +; GFX1064-NEXT: s_add_i32 s32, s1, s0 +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: uniform_value: +; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: s_mov_b32 s15, 
0x31c16000 +; GFX1032-NEXT: s_add_u32 s12, s12, s11 +; GFX1032-NEXT: s_addc_u32 s13, s13, 0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX1032-NEXT: s_movk_i32 s32, 0x200 +; GFX1032-NEXT: s_mov_b32 s33, 0 +; GFX1032-NEXT: s_mov_b32 s1, s32 +; GFX1032-NEXT: buffer_store_dword v0, off, s[12:15], s1 +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_lshl_b32 s0, s0, 2 +; GFX1032-NEXT: s_add_i32 s0, s0, 15 +; GFX1032-NEXT: s_and_b32 s0, s0, -16 +; GFX1032-NEXT: s_lshl_b32 s0, s0, 5 +; GFX1032-NEXT: s_add_i32 s32, s1, s0 +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: uniform_value: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX1164-NEXT: s_mov_b32 s32, 16 +; GFX1164-NEXT: s_mov_b32 s33, 0 +; GFX1164-NEXT: s_mov_b32 s1, s32 +; GFX1164-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_lshl_b32 s0, s0, 2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, 15 +; GFX1164-NEXT: s_and_b32 s0, s0, -16 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_lshl_b32 s0, s0, 6 +; GFX1164-NEXT: s_add_i32 s32, s1, s0 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: uniform_value: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX1132-NEXT: s_mov_b32 s32, 16 +; GFX1132-NEXT: s_mov_b32 s33, 0 +; GFX1132-NEXT: s_mov_b32 s1, s32 +; GFX1132-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_lshl_b32 s0, s0, 2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s0, s0, 15 +; GFX1132-NEXT: s_and_b32 s0, s0, -16 +; GFX1132-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_lshl_b32 s0, s0, 5 +; GFX1132-NEXT: s_add_i32 s32, s1, s0 +; GFX1132-NEXT: s_endpgm +entry: + %dyn_alloca = alloca i32, i32 %n, addrspace(5) + store volatile i32 123, ptr addrspace(5) %dyn_alloca + ret void +} + +define amdgpu_kernel void @divergent_value() { +; GFX8DAGISEL-LABEL: divergent_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX8DAGISEL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8DAGISEL-NEXT: s_mov_b32 s14, -1 +; GFX8DAGISEL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX8DAGISEL-NEXT: s_add_u32 s12, s12, s11 +; GFX8DAGISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX8DAGISEL-NEXT: s_addc_u32 s13, s13, 0 +; GFX8DAGISEL-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 +; GFX8DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX8DAGISEL-NEXT: s_mov_b32 s33, 0 +; GFX8DAGISEL-NEXT: s_movk_i32 s32, 0x400 +; GFX8DAGISEL-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX8DAGISEL-NEXT: v_readlane_b32 s4, v0, s3 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX8DAGISEL-NEXT: s_max_u32 s2, s2, s4 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: s_mov_b32 s0, s32 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_lshl_add_u32 v0, s2, 6, v0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s32, v0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX8DAGISEL-NEXT: buffer_store_dword v0, off, s[12:15], s0 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: s_endpgm +; GFX8-LABEL: divergent_value: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s90, -1 +; GFX8-NEXT: s_mov_b32 s91, 0xe80000 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_add_u32 s88, s88, 
s11 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 15, v0 +; GFX8-NEXT: s_addc_u32 s89, s89, 0 +; GFX8-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_mov_b32 s33, 0 +; GFX8-NEXT: s_movk_i32 s32, 0x400 +; GFX8-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX8-NEXT: v_readlane_b32 s4, v0, s3 +; GFX8-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX8-NEXT: s_max_u32 s2, s2, s4 +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX8-NEXT: ; %bb.2: +; GFX8-NEXT: v_lshlrev_b32_e64 v0, 6, s2 +; GFX8-NEXT: s_mov_b32 s0, s32 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s32, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX8-NEXT: buffer_store_dword v0, off, s[88:91], s0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: divergent_value: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s11 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_mov_b32 s33, 0 +; GFX9-NEXT: s_movk_i32 s32, 0x400 +; GFX9-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX9-NEXT: v_readlane_b32 s4, v0, s3 +; GFX9-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX9-NEXT: s_max_u32 s2, s2, s4 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX9-NEXT: ; %bb.2: +; GFX9-NEXT: s_mov_b32 s0, s32 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_lshl_add_u32 v0, s2, 6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s32, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: buffer_store_dword v0, off, s[12:15], s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: divergent_value: +; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s14, -1 +; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 +; GFX1064-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 +; GFX1064-NEXT: s_add_u32 s12, s12, s11 +; GFX1064-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_mov_b32 s33, 0 +; GFX1064-NEXT: s_movk_i32 s32, 0x400 +; GFX1064-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1064-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1064-NEXT: s_max_u32 s2, s2, s4 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1064-NEXT: ; %bb.2: +; GFX1064-NEXT: s_mov_b32 s0, s32 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX1064-NEXT: v_lshl_add_u32 v0, s2, 6, s0 +; GFX1064-NEXT: buffer_store_dword v1, off, s[12:15], s0 +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: v_readfirstlane_b32 s32, v0 +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: divergent_value: +; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX1032-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 +; GFX1032-NEXT: s_add_u32 s12, s12, s11 +; GFX1032-NEXT: s_addc_u32 s13, s13, 0 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: s_mov_b32 s33, 0 +; GFX1032-NEXT: s_movk_i32 s32, 0x200 +; GFX1032-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1032-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032-NEXT: s_bitset0_b32 s1, s2 +; GFX1032-NEXT: s_max_u32 s0, s0, 
s3 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1032-NEXT: ; %bb.2: +; GFX1032-NEXT: s_mov_b32 s1, s32 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX1032-NEXT: v_lshl_add_u32 v0, s0, 5, s1 +; GFX1032-NEXT: buffer_store_dword v1, off, s[12:15], s1 +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: v_readfirstlane_b32 s32, v0 +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: divergent_value: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_mov_b32 s33, 0 +; GFX1164-NEXT: s_mov_b32 s32, 16 +; GFX1164-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 +; GFX1164-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1164-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_max_u32 s2, s2, s4 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1164-NEXT: ; %bb.2: +; GFX1164-NEXT: s_mov_b32 s0, s32 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX1164-NEXT: v_lshl_add_u32 v0, s2, 6, s0 +; GFX1164-NEXT: scratch_store_b32 off, v1, s0 dlc +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: v_readfirstlane_b32 s32, v0 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: divergent_value: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: s_mov_b32 s33, 0 +; GFX1132-NEXT: s_mov_b32 s32, 16 +; GFX1132-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 +; GFX1132-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; 
GFX1132-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1132-NEXT: s_bitset0_b32 s1, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_max_u32 s0, s0, s3 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1132-NEXT: ; %bb.2: +; GFX1132-NEXT: s_mov_b32 s1, s32 +; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX1132-NEXT: v_lshl_add_u32 v0, s0, 5, s1 +; GFX1132-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: v_readfirstlane_b32 s32, v0 +; GFX1132-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %dyn_alloca = alloca i32, i32 %idx, addrspace(5) + store volatile i32 123, ptr addrspace(5) %dyn_alloca + ret void +} + +define void @custom_alignment(i32 %n) { +; GFX8DAGISEL-LABEL: custom_alignment: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX8DAGISEL-NEXT: s_mov_b32 s9, s33 +; GFX8DAGISEL-NEXT: s_add_i32 s33, s32, 0xffc0 +; GFX8DAGISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX8DAGISEL-NEXT: s_and_b32 s33, s33, 0xffff0000 +; GFX8DAGISEL-NEXT: s_add_i32 s32, s32, 0x20000 +; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8DAGISEL-NEXT: s_max_u32 s6, s6, s8 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: s_mov_b32 s4, s32 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8DAGISEL-NEXT: v_lshl_add_u32 v0, s6, 6, v0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX8DAGISEL-NEXT: s_and_b32 s32, s5, 0xffff0000 +; GFX8DAGISEL-NEXT: 
v_mov_b32_e32 v0, 0x7b +; GFX8DAGISEL-NEXT: buffer_store_dword v0, off, s[0:3], s4 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: s_add_i32 s32, s32, 0xfffe0000 +; GFX8DAGISEL-NEXT: s_mov_b32 s33, s9 +; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] +; GFX8-LABEL: custom_alignment: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 15, v0 +; GFX8-NEXT: s_mov_b32 s9, s33 +; GFX8-NEXT: s_add_i32 s33, s32, 0xffc0 +; GFX8-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_mov_b32 s6, 0 +; GFX8-NEXT: s_and_b32 s33, s33, 0xffff0000 +; GFX8-NEXT: s_add_i32 s32, s32, 0x20000 +; GFX8-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8-NEXT: s_max_u32 s6, s6, s8 +; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8-NEXT: ; %bb.2: +; GFX8-NEXT: v_lshlrev_b32_e64 v0, 6, s6 +; GFX8-NEXT: s_mov_b32 s4, s32 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_readfirstlane_b32 s5, v0 +; GFX8-NEXT: s_and_b32 s32, s5, 0xffff0000 +; GFX8-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s4 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_add_i32 s32, s32, 0xfffe0000 +; GFX8-NEXT: s_mov_b32 s33, s9 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: custom_alignment: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-NEXT: s_mov_b32 s9, s33 +; GFX9-NEXT: s_add_i32 s33, s32, 0xffc0 +; GFX9-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: s_and_b32 s33, s33, 0xffff0000 +; GFX9-NEXT: s_add_i32 s32, s32, 0x20000 +; GFX9-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s7, s[4:5] +; 
GFX9-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9-NEXT: s_max_u32 s6, s6, s8 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9-NEXT: ; %bb.2: +; GFX9-NEXT: s_mov_b32 s4, s32 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_lshl_add_u32 v0, s6, 6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: s_and_b32 s32, s5, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_add_i32 s32, s32, 0xfffe0000 +; GFX9-NEXT: s_mov_b32 s33, s9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064-LABEL: custom_alignment: +; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX1064-NEXT: s_mov_b32 s9, s33 +; GFX1064-NEXT: s_add_i32 s33, s32, 0xffc0 +; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_mov_b32 s6, 0 +; GFX1064-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX1064-NEXT: s_and_b32 s33, s33, 0xffff0000 +; GFX1064-NEXT: s_add_i32 s32, s32, 0x20000 +; GFX1064-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064-NEXT: s_max_u32 s6, s6, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064-NEXT: ; %bb.2: +; GFX1064-NEXT: s_mov_b32 s4, s32 +; GFX1064-NEXT: s_mov_b32 s33, s9 +; GFX1064-NEXT: v_lshl_add_u32 v0, s6, 6, s4 +; GFX1064-NEXT: v_readfirstlane_b32 s5, v0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX1064-NEXT: s_and_b32 s32, s5, 0xffff0000 +; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], s4 +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: s_add_i32 s32, s32, 0xfffe0000 +; GFX1064-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032-LABEL: custom_alignment: +; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX1032-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX1032-NEXT: s_mov_b32 s8, s33 +; GFX1032-NEXT: s_add_i32 s33, s32, 0x7fe0 +; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX1032-NEXT: s_and_b32 s33, s33, 0xffff8000 +; GFX1032-NEXT: s_add_i32 s32, s32, 0x10000 +; GFX1032-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s6, s5 +; GFX1032-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1032-NEXT: s_bitset0_b32 s5, s6 +; GFX1032-NEXT: s_max_u32 s4, s4, s7 +; GFX1032-NEXT: s_cmp_lg_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032-NEXT: ; %bb.2: +; GFX1032-NEXT: s_mov_b32 s5, s32 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: v_lshl_add_u32 v0, s4, 5, s5 +; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX1032-NEXT: s_and_b32 s32, s4, 0xffff8000 +; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], s5 +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: s_add_i32 s32, s32, 0xffff0000 +; GFX1032-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164-LABEL: custom_alignment: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX1164-NEXT: s_mov_b32 s5, s33 +; GFX1164-NEXT: s_add_i32 s33, s32, 0x3ff +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX1164-NEXT: s_and_b32 s33, s33, 0xfffffc00 +; GFX1164-NEXT: s_addk_i32 s32, 0x800 +; GFX1164-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1164-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_max_u32 s2, s2, s4 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164-NEXT: ; %bb.2: +; 
GFX1164-NEXT: s_mov_b32 s0, s32 +; GFX1164-NEXT: s_mov_b32 s33, s5 +; GFX1164-NEXT: v_lshl_add_u32 v0, s2, 6, s0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_readfirstlane_b32 s1, v0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX1164-NEXT: s_and_b32 s32, s1, 0xffff0000 +; GFX1164-NEXT: scratch_store_b32 off, v0, s0 dlc +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: s_addk_i32 s32, 0xf800 +; GFX1164-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132-LABEL: custom_alignment: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX1132-NEXT: s_mov_b32 s4, s33 +; GFX1132-NEXT: s_add_i32 s33, s32, 0x3ff +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX1132-NEXT: s_and_b32 s33, s33, 0xfffffc00 +; GFX1132-NEXT: s_addk_i32 s32, 0x800 +; GFX1132-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1132-NEXT: s_bitset0_b32 s1, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_max_u32 s0, s0, s3 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132-NEXT: ; %bb.2: +; GFX1132-NEXT: s_mov_b32 s1, s32 +; GFX1132-NEXT: s_mov_b32 s33, s4 +; GFX1132-NEXT: v_lshl_add_u32 v0, s0, 5, s1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX1132-NEXT: s_and_b32 s32, s0, 0xffff8000 +; GFX1132-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: s_addk_i32 s32, 0xf800 +; GFX1132-NEXT: s_setpc_b64 s[30:31] +entry: + %dyn_alloca = alloca i32, i32 %n, align 1024, addrspace(5) + store volatile i32 
123, ptr addrspace(5) %dyn_alloca + ret void +} + +define void @pointer_offset(<4 x i32> %a, <4 x i32> %b, i32 %n) { +; GFX8DAGISEL-LABEL: pointer_offset: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mov_b32 s9, s33 +; GFX8DAGISEL-NEXT: v_lshlrev_b32_e32 v8, 4, v8 +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX8DAGISEL-NEXT: s_mov_b32 s33, s32 +; GFX8DAGISEL-NEXT: s_addk_i32 s32, 0x400 +; GFX8DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v8, s7 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8DAGISEL-NEXT: s_max_u32 s6, s6, s8 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: s_mov_b32 s4, s32 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v8, s4 +; GFX8DAGISEL-NEXT: v_lshl_add_u32 v8, s6, 6, v8 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s32, v8 +; GFX8DAGISEL-NEXT: buffer_store_dword v3, off, s[0:3], s4 offset:12 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: buffer_store_dword v2, off, s[0:3], s4 offset:8 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: buffer_store_dword v1, off, s[0:3], s4 offset:4 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: buffer_store_dword v0, off, s[0:3], s4 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s4 offset:28 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s4 offset:24 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s4 offset:20 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s4 offset:16 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: s_addk_i32 s32, 0xfc00 +; GFX8DAGISEL-NEXT: s_mov_b32 s33, s9 
+; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] +; GFX8-LABEL: pointer_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s9, s33 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 4, v8 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_mov_b32 s6, 0 +; GFX8-NEXT: s_mov_b32 s33, s32 +; GFX8-NEXT: s_addk_i32 s32, 0x400 +; GFX8-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8-NEXT: v_readlane_b32 s8, v8, s7 +; GFX8-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8-NEXT: s_max_u32 s6, s6, s8 +; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX8-NEXT: ; %bb.2: +; GFX8-NEXT: v_lshlrev_b32_e64 v8, 6, s6 +; GFX8-NEXT: s_mov_b32 s4, s32 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8 +; GFX8-NEXT: v_readfirstlane_b32 s32, v8 +; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s4 offset:12 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s4 offset:8 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s4 offset:4 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s4 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v7, off, s[0:3], s4 offset:28 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v6, off, s[0:3], s4 offset:24 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v5, off, s[0:3], s4 offset:20 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v4, off, s[0:3], s4 offset:16 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_addk_i32 s32, 0xfc00 +; GFX8-NEXT: s_mov_b32 s33, s9 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: pointer_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s9, s33 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 4, v8 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: 
s_addk_i32 s32, 0x400 +; GFX9-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9-NEXT: v_readlane_b32 s8, v8, s7 +; GFX9-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9-NEXT: s_max_u32 s6, s6, s8 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX9-NEXT: ; %bb.2: +; GFX9-NEXT: s_mov_b32 s4, s32 +; GFX9-NEXT: v_mov_b32_e32 v8, s4 +; GFX9-NEXT: v_lshl_add_u32 v8, s6, 6, v8 +; GFX9-NEXT: v_readfirstlane_b32 s32, v8 +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s4 offset:12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s4 offset:8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s4 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s4 offset:28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s4 offset:24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s4 offset:20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s4 offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064-LABEL: pointer_offset: +; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064-NEXT: v_lshlrev_b32_e32 v8, 4, v8 +; GFX1064-NEXT: s_mov_b32 s9, s33 +; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_mov_b32 s6, 0 +; GFX1064-NEXT: s_mov_b32 s33, s32 +; GFX1064-NEXT: s_addk_i32 s32, 0x400 +; GFX1064-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064-NEXT: v_readlane_b32 s8, v8, s7 +; GFX1064-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064-NEXT: s_max_u32 s6, s6, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064-NEXT: s_cbranch_scc1 
.LBB4_1 +; GFX1064-NEXT: ; %bb.2: +; GFX1064-NEXT: s_mov_b32 s4, s32 +; GFX1064-NEXT: s_mov_b32 s33, s9 +; GFX1064-NEXT: v_lshl_add_u32 v8, s6, 6, s4 +; GFX1064-NEXT: buffer_store_dword v3, off, s[0:3], s4 offset:12 +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[0:3], s4 offset:8 +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: buffer_store_dword v1, off, s[0:3], s4 offset:4 +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], s4 +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: buffer_store_dword v7, off, s[0:3], s4 offset:28 +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: buffer_store_dword v6, off, s[0:3], s4 offset:24 +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: buffer_store_dword v5, off, s[0:3], s4 offset:20 +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[0:3], s4 offset:16 +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: v_readfirstlane_b32 s32, v8 +; GFX1064-NEXT: s_addk_i32 s32, 0xfc00 +; GFX1064-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032-LABEL: pointer_offset: +; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032-NEXT: v_lshlrev_b32_e32 v8, 4, v8 +; GFX1032-NEXT: s_mov_b32 s8, s33 +; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s33, s32 +; GFX1032-NEXT: s_addk_i32 s32, 0x200 +; GFX1032-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s6, s5 +; GFX1032-NEXT: v_readlane_b32 s7, v8, s6 +; GFX1032-NEXT: s_bitset0_b32 s5, s6 +; GFX1032-NEXT: s_max_u32 s4, s4, s7 +; GFX1032-NEXT: s_cmp_lg_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1032-NEXT: ; %bb.2: +; GFX1032-NEXT: s_mov_b32 s5, s32 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: v_lshl_add_u32 v8, s4, 5, s5 +; GFX1032-NEXT: buffer_store_dword v3, off, s[0:3], s5 offset:12 +; 
GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[0:3], s5 offset:8 +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: buffer_store_dword v1, off, s[0:3], s5 offset:4 +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], s5 +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: buffer_store_dword v7, off, s[0:3], s5 offset:28 +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: buffer_store_dword v6, off, s[0:3], s5 offset:24 +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[0:3], s5 offset:20 +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[0:3], s5 offset:16 +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: v_readfirstlane_b32 s32, v8 +; GFX1032-NEXT: s_addk_i32 s32, 0xfe00 +; GFX1032-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164-LABEL: pointer_offset: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164-NEXT: v_lshlrev_b32_e32 v8, 4, v8 +; GFX1164-NEXT: s_mov_b32 s5, s33 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_mov_b32 s33, s32 +; GFX1164-NEXT: s_add_i32 s32, s32, 16 +; GFX1164-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_readlane_b32 s4, v8, s3 +; GFX1164-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164-NEXT: s_max_u32 s2, s2, s4 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1164-NEXT: ; %bb.2: +; GFX1164-NEXT: s_mov_b32 s0, s32 +; GFX1164-NEXT: s_mov_b32 s33, s5 +; GFX1164-NEXT: v_lshl_add_u32 v8, s2, 6, s0 +; GFX1164-NEXT: s_add_i32 s1, s0, 16 +; GFX1164-NEXT: scratch_store_b128 off, v[0:3], s0 dlc +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: scratch_store_b128 off, v[4:7], s1 
dlc +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: v_readfirstlane_b32 s32, v8 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_add_i32 s32, s32, -16 +; GFX1164-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132-LABEL: pointer_offset: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132-NEXT: v_lshlrev_b32_e32 v8, 4, v8 +; GFX1132-NEXT: s_mov_b32 s4, s33 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: s_mov_b32 s33, s32 +; GFX1132-NEXT: s_add_i32 s32, s32, 16 +; GFX1132-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_readlane_b32 s3, v8, s2 +; GFX1132-NEXT: s_bitset0_b32 s1, s2 +; GFX1132-NEXT: s_max_u32 s0, s0, s3 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1132-NEXT: ; %bb.2: +; GFX1132-NEXT: s_mov_b32 s1, s32 +; GFX1132-NEXT: s_mov_b32 s33, s4 +; GFX1132-NEXT: v_lshl_add_u32 v8, s0, 5, s1 +; GFX1132-NEXT: s_add_i32 s0, s1, 16 +; GFX1132-NEXT: scratch_store_b128 off, v[0:3], s1 dlc +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: scratch_store_b128 off, v[4:7], s0 dlc +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: v_readfirstlane_b32 s32, v8 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_add_i32 s32, s32, -16 +; GFX1132-NEXT: s_setpc_b64 s[30:31] +entry: + %dyn_alloca = alloca <4 x i32>, i32 %n, addrspace(5) + %ptr = getelementptr <4 x i32>, ptr addrspace(5) %dyn_alloca, i32 1 + store volatile <4 x i32> %a, ptr addrspace(5) %dyn_alloca + store volatile <4 x i32> %b, ptr addrspace(5) %ptr + ret void +} + +define void @multiple_allocas(i32 %m, i32 %n) { +; GFX8DAGISEL-LABEL: multiple_allocas: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_lshl_add_u32 v0, v0, 
2, 15 +; GFX8DAGISEL-NEXT: s_mov_b32 s10, s33 +; GFX8DAGISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s7, 0 +; GFX8DAGISEL-NEXT: s_mov_b32 s33, s32 +; GFX8DAGISEL-NEXT: s_addk_i32 s32, 0x800 +; GFX8DAGISEL-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s6 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s6 +; GFX8DAGISEL-NEXT: s_max_u32 s7, s7, s8 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: s_mov_b32 s6, s32 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX8DAGISEL-NEXT: v_lshl_add_u32 v0, s7, 6, v0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s32, v0 +; GFX8DAGISEL-NEXT: v_lshl_add_u32 v0, v1, 2, 15 +; GFX8DAGISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s7, 0 +; GFX8DAGISEL-NEXT: .LBB5_3: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX8DAGISEL-NEXT: v_readlane_b32 s9, v0, s8 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s8 +; GFX8DAGISEL-NEXT: s_max_u32 s7, s7, s9 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB5_3 +; GFX8DAGISEL-NEXT: ; %bb.4: +; GFX8DAGISEL-NEXT: s_mov_b32 s4, s32 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8DAGISEL-NEXT: v_lshl_add_u32 v0, s7, 6, v0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s32, v0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX8DAGISEL-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: buffer_store_dword v0, off, s[0:3], s6 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: buffer_store_dword v0, off, s[0:3], s4 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: s_addk_i32 s32, 0xf800 +; GFX8DAGISEL-NEXT: s_mov_b32 s33, s10 +; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] +; 
GFX8-LABEL: multiple_allocas: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 15, v0 +; GFX8-NEXT: s_mov_b32 s10, s33 +; GFX8-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_mov_b32 s6, 0 +; GFX8-NEXT: s_mov_b32 s33, s32 +; GFX8-NEXT: s_addk_i32 s32, 0x800 +; GFX8-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8-NEXT: s_max_u32 s6, s6, s8 +; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX8-NEXT: ; %bb.2: +; GFX8-NEXT: v_lshlrev_b32_e64 v0, 6, s6 +; GFX8-NEXT: s_mov_b32 s6, s32 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0 +; GFX8-NEXT: v_readfirstlane_b32 s32, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 15, v0 +; GFX8-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: .LBB5_3: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX8-NEXT: v_readlane_b32 s9, v0, s8 +; GFX8-NEXT: s_bitset0_b64 s[4:5], s8 +; GFX8-NEXT: s_max_u32 s7, s7, s9 +; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB5_3 +; GFX8-NEXT: ; %bb.4: +; GFX8-NEXT: v_lshlrev_b32_e64 v0, 6, s7 +; GFX8-NEXT: s_mov_b32 s4, s32 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_readfirstlane_b32 s32, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s6 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s4 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_addk_i32 s32, 0xf800 +; GFX8-NEXT: s_mov_b32 s33, s10 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: multiple_allocas: +; GFX9: ; %bb.0: ; 
%entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-NEXT: s_mov_b32 s10, s33 +; GFX9-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_addk_i32 s32, 0x800 +; GFX9-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX9-NEXT: v_readlane_b32 s8, v0, s6 +; GFX9-NEXT: s_bitset0_b64 s[4:5], s6 +; GFX9-NEXT: s_max_u32 s7, s7, s8 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX9-NEXT: ; %bb.2: +; GFX9-NEXT: s_mov_b32 s6, s32 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_lshl_add_u32 v0, s7, 6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s32, v0 +; GFX9-NEXT: v_lshl_add_u32 v0, v1, 2, 15 +; GFX9-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: .LBB5_3: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9-NEXT: v_readlane_b32 s9, v0, s8 +; GFX9-NEXT: s_bitset0_b64 s[4:5], s8 +; GFX9-NEXT: s_max_u32 s7, s7, s9 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB5_3 +; GFX9-NEXT: ; %bb.4: +; GFX9-NEXT: s_mov_b32 s4, s32 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_lshl_add_u32 v0, s7, 6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s32, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_addk_i32 s32, 0xf800 +; GFX9-NEXT: s_mov_b32 s33, s10 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064-LABEL: multiple_allocas: +; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX1064-NEXT: s_mov_b32 s10, s33 +; 
GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_mov_b32 s7, 0 +; GFX1064-NEXT: s_mov_b32 s33, s32 +; GFX1064-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX1064-NEXT: s_addk_i32 s32, 0x800 +; GFX1064-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX1064-NEXT: v_readlane_b32 s8, v0, s6 +; GFX1064-NEXT: s_bitset0_b64 s[4:5], s6 +; GFX1064-NEXT: s_max_u32 s7, s7, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1064-NEXT: ; %bb.2: +; GFX1064-NEXT: v_lshl_add_u32 v0, v1, 2, 15 +; GFX1064-NEXT: s_mov_b32 s6, s32 +; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: v_lshl_add_u32 v1, s7, 6, s6 +; GFX1064-NEXT: s_mov_b32 s7, 0 +; GFX1064-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s32, v1 +; GFX1064-NEXT: .LBB5_3: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX1064-NEXT: v_readlane_b32 s9, v0, s8 +; GFX1064-NEXT: s_bitset0_b64 s[4:5], s8 +; GFX1064-NEXT: s_max_u32 s7, s7, s9 +; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB5_3 +; GFX1064-NEXT: ; %bb.4: +; GFX1064-NEXT: s_mov_b32 s4, s32 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX1064-NEXT: v_lshl_add_u32 v0, s7, 6, s4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[0:3], s33 +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: buffer_store_dword v1, off, s[0:3], s6 +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: buffer_store_dword v1, off, s[0:3], s4 +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: v_readfirstlane_b32 s32, v0 +; GFX1064-NEXT: s_mov_b32 s33, s10 +; GFX1064-NEXT: s_addk_i32 s32, 0xf800 +; GFX1064-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032-LABEL: multiple_allocas: +; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX1032-NEXT: s_mov_b32 s9, s33 +; GFX1032-NEXT: s_mov_b32 s4, exec_lo +; GFX1032-NEXT: s_mov_b32 s5, 0 
+; GFX1032-NEXT: s_mov_b32 s33, s32 +; GFX1032-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX1032-NEXT: s_addk_i32 s32, 0x400 +; GFX1032-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1032-NEXT: s_bitset0_b32 s4, s6 +; GFX1032-NEXT: s_max_u32 s5, s5, s7 +; GFX1032-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1032-NEXT: ; %bb.2: +; GFX1032-NEXT: v_lshl_add_u32 v0, v1, 2, 15 +; GFX1032-NEXT: s_mov_b32 s4, s32 +; GFX1032-NEXT: s_mov_b32 s6, exec_lo +; GFX1032-NEXT: v_lshl_add_u32 v1, s5, 5, s4 +; GFX1032-NEXT: s_mov_b32 s5, 0 +; GFX1032-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s32, v1 +; GFX1032-NEXT: .LBB5_3: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s7, s6 +; GFX1032-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1032-NEXT: s_bitset0_b32 s6, s7 +; GFX1032-NEXT: s_max_u32 s5, s5, s8 +; GFX1032-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB5_3 +; GFX1032-NEXT: ; %bb.4: +; GFX1032-NEXT: s_mov_b32 s6, s32 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX1032-NEXT: v_lshl_add_u32 v0, s5, 5, s6 +; GFX1032-NEXT: buffer_store_dword v1, off, s[0:3], s33 +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: buffer_store_dword v1, off, s[0:3], s4 +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: buffer_store_dword v1, off, s[0:3], s6 +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: v_readfirstlane_b32 s32, v0 +; GFX1032-NEXT: s_mov_b32 s33, s9 +; GFX1032-NEXT: s_addk_i32 s32, 0xfc00 +; GFX1032-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164-LABEL: multiple_allocas: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX1164-NEXT: s_mov_b32 s6, s33 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_mov_b32 s33, s32 +; GFX1164-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX1164-NEXT: 
s_add_i32 s32, s32, 32 +; GFX1164-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: s_bitset0_b64 s[0:1], s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_max_u32 s3, s3, s4 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1164-NEXT: ; %bb.2: +; GFX1164-NEXT: v_lshl_add_u32 v0, v1, 2, 15 +; GFX1164-NEXT: s_mov_b32 s2, s32 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: v_lshl_add_u32 v1, s3, 6, s2 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_readfirstlane_b32 s32, v1 +; GFX1164-NEXT: .LBB5_3: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1164-NEXT: s_bitset0_b64 s[0:1], s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_max_u32 s3, s3, s5 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB5_3 +; GFX1164-NEXT: ; %bb.4: +; GFX1164-NEXT: s_mov_b32 s0, s32 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX1164-NEXT: v_lshl_add_u32 v0, s3, 6, s0 +; GFX1164-NEXT: scratch_store_b32 off, v1, s33 dlc +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: scratch_store_b32 off, v1, s2 dlc +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: scratch_store_b32 off, v1, s0 dlc +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: v_readfirstlane_b32 s32, v0 +; GFX1164-NEXT: s_mov_b32 s33, s6 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_addk_i32 s32, 0xffe0 +; GFX1164-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132-LABEL: multiple_allocas: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX1132-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX1132-NEXT: s_mov_b32 s5, s33 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s1, 0 +; GFX1132-NEXT: s_mov_b32 s33, s32 +; GFX1132-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX1132-NEXT: s_add_i32 s32, s32, 32 +; GFX1132-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s2, s0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1132-NEXT: s_bitset0_b32 s0, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_max_u32 s1, s1, s3 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1132-NEXT: ; %bb.2: +; GFX1132-NEXT: v_lshl_add_u32 v0, v1, 2, 15 +; GFX1132-NEXT: s_mov_b32 s0, s32 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: v_lshl_add_u32 v1, s1, 5, s0 +; GFX1132-NEXT: s_mov_b32 s1, 0 +; GFX1132-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: v_readfirstlane_b32 s32, v1 +; GFX1132-NEXT: .LBB5_3: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1132-NEXT: s_bitset0_b32 s2, s3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_max_u32 s1, s1, s4 +; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB5_3 +; GFX1132-NEXT: ; %bb.4: +; GFX1132-NEXT: s_mov_b32 s2, s32 +; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX1132-NEXT: v_lshl_add_u32 v0, s1, 5, s2 +; GFX1132-NEXT: scratch_store_b32 off, v1, s33 dlc +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: scratch_store_b32 off, v1, s0 dlc +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: scratch_store_b32 off, v1, s2 dlc +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: v_readfirstlane_b32 s32, v0 +; GFX1132-NEXT: s_mov_b32 s33, s5 +; 
GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_addk_i32 s32, 0xffe0 +; GFX1132-NEXT: s_setpc_b64 s[30:31] +entry: + %static_alloca = alloca i32, i32 4, addrspace(5) + %dyn_alloca_1 = alloca i32, i32 %m, addrspace(5) + %dyn_alloca_2 = alloca i32, i32 %n, addrspace(5) + store volatile i32 123, ptr addrspace(5) %static_alloca + store volatile i32 123, ptr addrspace(5) %dyn_alloca_1 + store volatile i32 123, ptr addrspace(5) %dyn_alloca_2 + ret void +} + +define void @callee(<33 x i32> %a){ +; GFX8DAGISEL-LABEL: callee: +; GFX8DAGISEL: ; %bb.0: +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mov_b32 s9, s33 +; GFX8DAGISEL-NEXT: s_mov_b32 s33, s32 +; GFX8DAGISEL-NEXT: buffer_load_dword v0, off, s[0:3], s33 +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX8DAGISEL-NEXT: s_addk_i32 s32, 0x400 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX8DAGISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX8DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8DAGISEL-NEXT: s_max_u32 s6, s6, s8 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: s_mov_b32 s4, s32 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8DAGISEL-NEXT: v_lshl_add_u32 v0, s6, 6, v0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s32, v0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, 0x3039 +; GFX8DAGISEL-NEXT: buffer_store_dword v0, off, s[0:3], s4 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: s_addk_i32 s32, 0xfc00 +; GFX8DAGISEL-NEXT: s_mov_b32 s33, s9 +; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] +; GFX8-LABEL: callee: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s9, s33 +; GFX8-NEXT: s_mov_b32 s33, 
s32 +; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s33 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_mov_b32 s6, 0 +; GFX8-NEXT: s_addk_i32 s32, 0x400 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 15, v0 +; GFX8-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX8-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8-NEXT: s_max_u32 s6, s6, s8 +; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX8-NEXT: ; %bb.2: +; GFX8-NEXT: v_lshlrev_b32_e64 v0, 6, s6 +; GFX8-NEXT: s_mov_b32 s4, s32 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_readfirstlane_b32 s32, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0x3039 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s4 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_addk_i32 s32, 0xfc00 +; GFX8-NEXT: s_mov_b32 s33, s9 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: callee: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s9, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9-NEXT: s_max_u32 s6, s6, s8 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX9-NEXT: ; %bb.2: +; GFX9-NEXT: s_mov_b32 s4, s32 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_lshl_add_u32 v0, s6, 6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s32, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x3039 +; GFX9-NEXT: buffer_store_dword v0, off, 
s[0:3], s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064-LABEL: callee: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_mov_b32 s9, s33 +; GFX1064-NEXT: s_mov_b32 s33, s32 +; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: buffer_load_dword v0, off, s[0:3], s33 +; GFX1064-NEXT: s_mov_b32 s6, 0 +; GFX1064-NEXT: s_addk_i32 s32, 0x400 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX1064-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX1064-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064-NEXT: s_max_u32 s6, s6, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1064-NEXT: ; %bb.2: +; GFX1064-NEXT: s_mov_b32 s4, s32 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX1064-NEXT: v_lshl_add_u32 v0, s6, 6, s4 +; GFX1064-NEXT: s_mov_b32 s33, s9 +; GFX1064-NEXT: buffer_store_dword v1, off, s[0:3], s4 +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: v_readfirstlane_b32 s32, v0 +; GFX1064-NEXT: s_addk_i32 s32, 0xfc00 +; GFX1064-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032-LABEL: callee: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_mov_b32 s8, s33 +; GFX1032-NEXT: s_mov_b32 s33, s32 +; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: buffer_load_dword v0, off, s[0:3], s33 +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_addk_i32 s32, 0x200 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX1032-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX1032-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s6, s5 +; GFX1032-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1032-NEXT: s_bitset0_b32 s5, s6 +; GFX1032-NEXT: 
s_max_u32 s4, s4, s7 +; GFX1032-NEXT: s_cmp_lg_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1032-NEXT: ; %bb.2: +; GFX1032-NEXT: s_mov_b32 s5, s32 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX1032-NEXT: v_lshl_add_u32 v0, s4, 5, s5 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: buffer_store_dword v1, off, s[0:3], s5 +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: v_readfirstlane_b32 s32, v0 +; GFX1032-NEXT: s_addk_i32 s32, 0xfe00 +; GFX1032-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164-LABEL: callee: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164-NEXT: s_mov_b32 s5, s33 +; GFX1164-NEXT: s_mov_b32 s33, s32 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: scratch_load_b32 v0, off, s33 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_add_i32 s32, s32, 16 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX1164-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1164-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_max_u32 s2, s2, s4 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1164-NEXT: ; %bb.2: +; GFX1164-NEXT: s_mov_b32 s0, s32 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX1164-NEXT: v_lshl_add_u32 v0, s2, 6, s0 +; GFX1164-NEXT: s_mov_b32 s33, s5 +; GFX1164-NEXT: scratch_store_b32 off, v1, s0 dlc +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: v_readfirstlane_b32 s32, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_add_i32 s32, s32, -16 +; GFX1164-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132-LABEL: callee: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX1132-NEXT: s_mov_b32 s4, s33 +; GFX1132-NEXT: s_mov_b32 s33, s32 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: scratch_load_b32 v0, off, s33 +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: s_add_i32 s32, s32, 16 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX1132-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1132-NEXT: s_bitset0_b32 s1, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_max_u32 s0, s0, s3 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1132-NEXT: ; %bb.2: +; GFX1132-NEXT: s_mov_b32 s1, s32 +; GFX1132-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX1132-NEXT: v_lshl_add_u32 v0, s0, 5, s1 +; GFX1132-NEXT: s_mov_b32 s33, s4 +; GFX1132-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: v_readfirstlane_b32 s32, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_add_i32 s32, s32, -16 +; GFX1132-NEXT: s_setpc_b64 s[30:31] + %val = extractelement <33 x i32> %a, i32 31 + %dyn_alloca_callee = alloca i32, i32 %val, addrspace(5) + store volatile i32 12345, ptr addrspace(5) %dyn_alloca_callee + ret void +} + +define amdgpu_kernel void @caller(<33 x i32> %a) { +; GFX8DAGISEL-LABEL: caller: +; GFX8DAGISEL: ; %bb.0: +; GFX8DAGISEL-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GFX8DAGISEL-NEXT: s_mov_b32 s12, s8 +; GFX8DAGISEL-NEXT: s_load_dwordx16 s[16:31], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x64 +; GFX8DAGISEL-NEXT: s_load_dword s8, s[4:5], 0xa4 +; GFX8DAGISEL-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GFX8DAGISEL-NEXT: s_mov_b32 s54, -1 +; GFX8DAGISEL-NEXT: s_mov_b32 s55, 0xe00000 +; 
GFX8DAGISEL-NEXT: s_add_u32 s52, s52, s11 +; GFX8DAGISEL-NEXT: v_lshl_add_u32 v3, v0, 3, 15 +; GFX8DAGISEL-NEXT: s_addc_u32 s53, s53, 0 +; GFX8DAGISEL-NEXT: s_mov_b32 s14, s10 +; GFX8DAGISEL-NEXT: s_mov_b32 s13, s9 +; GFX8DAGISEL-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX8DAGISEL-NEXT: v_and_b32_e32 v3, 0x3ff0, v3 +; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s9, 0 +; GFX8DAGISEL-NEXT: s_mov_b32 s33, 0 +; GFX8DAGISEL-NEXT: s_movk_i32 s32, 0x400 +; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s15, s[6:7] +; GFX8DAGISEL-NEXT: v_readlane_b32 s34, v3, s15 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s15 +; GFX8DAGISEL-NEXT: s_max_u32 s9, s9, s34 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: s_mov_b32 s6, s32 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s6 +; GFX8DAGISEL-NEXT: v_lshl_add_u32 v3, s9, 6, v3 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s32, v3 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, 0x7b +; GFX8DAGISEL-NEXT: buffer_store_dword v3, off, s[52:55], s6 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s51 +; GFX8DAGISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX8DAGISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX8DAGISEL-NEXT: buffer_store_dword v3, off, s[52:55], s32 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8DAGISEL-NEXT: s_add_u32 s8, s4, 0x124 +; GFX8DAGISEL-NEXT: s_addc_u32 s9, s5, 0 +; GFX8DAGISEL-NEXT: s_getpc_b64 s[4:5] +; GFX8DAGISEL-NEXT: s_add_u32 s4, s4, callee@gotpcrel32@lo+4 +; GFX8DAGISEL-NEXT: s_addc_u32 s5, s5, callee@gotpcrel32@hi+12 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX8DAGISEL-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX8DAGISEL-NEXT: buffer_store_dword v3, off, s[52:55], s32 offset:4 +; GFX8DAGISEL-NEXT: v_or3_b32 v31, v0, v1, v2 
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s16 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s17 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s18 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s19 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s20 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v5, s21 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, s22 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, s23 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v8, s24 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v9, s25 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v10, s26 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v11, s27 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v12, s28 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v13, s29 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v14, s30 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v15, s31 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v16, s36 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v17, s37 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v18, s38 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v19, s39 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v20, s40 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v21, s41 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v22, s42 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v23, s43 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v24, s44 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v25, s45 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v26, s46 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v27, s47 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v28, s48 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v29, s49 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v30, s50 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX8DAGISEL-NEXT: s_endpgm +; GFX8-LABEL: caller: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s12, s8 +; GFX8-NEXT: s_load_dwordx16 s[16:31], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x64 +; GFX8-NEXT: s_load_dword s8, s[4:5], 0xa4 +; GFX8-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s90, -1 +; GFX8-NEXT: s_mov_b32 s91, 0xe80000 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX8-NEXT: s_add_u32 s88, s88, s11 
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 15, v3 +; GFX8-NEXT: s_addc_u32 s89, s89, 0 +; GFX8-NEXT: s_mov_b32 s14, s10 +; GFX8-NEXT: s_mov_b32 s13, s9 +; GFX8-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX8-NEXT: v_and_b32_e32 v3, 0x3ff0, v3 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: s_mov_b32 s9, 0 +; GFX8-NEXT: s_mov_b32 s33, 0 +; GFX8-NEXT: s_movk_i32 s32, 0x400 +; GFX8-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b64 s15, s[6:7] +; GFX8-NEXT: v_readlane_b32 s34, v3, s15 +; GFX8-NEXT: s_bitset0_b64 s[6:7], s15 +; GFX8-NEXT: s_max_u32 s9, s9, s34 +; GFX8-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX8-NEXT: ; %bb.2: +; GFX8-NEXT: v_lshlrev_b32_e64 v3, 6, s9 +; GFX8-NEXT: s_mov_b32 s6, s32 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_readfirstlane_b32 s32, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, 0x7b +; GFX8-NEXT: buffer_store_dword v3, off, s[88:91], s6 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, s51 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX8-NEXT: buffer_store_dword v3, off, s[88:91], s32 +; GFX8-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NEXT: s_add_u32 s8, s4, 0x124 +; GFX8-NEXT: s_addc_u32 s9, s5, 0 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, callee@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, callee@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX8-NEXT: s_mov_b64 s[0:1], s[88:89] +; GFX8-NEXT: buffer_store_dword v3, off, s[88:91], s32 offset:4 +; GFX8-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX8-NEXT: s_mov_b64 s[2:3], s[90:91] +; GFX8-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 +; GFX8-NEXT: v_mov_b32_e32 v5, s21 +; GFX8-NEXT: 
v_mov_b32_e32 v6, s22 +; GFX8-NEXT: v_mov_b32_e32 v7, s23 +; GFX8-NEXT: v_mov_b32_e32 v8, s24 +; GFX8-NEXT: v_mov_b32_e32 v9, s25 +; GFX8-NEXT: v_mov_b32_e32 v10, s26 +; GFX8-NEXT: v_mov_b32_e32 v11, s27 +; GFX8-NEXT: v_mov_b32_e32 v12, s28 +; GFX8-NEXT: v_mov_b32_e32 v13, s29 +; GFX8-NEXT: v_mov_b32_e32 v14, s30 +; GFX8-NEXT: v_mov_b32_e32 v15, s31 +; GFX8-NEXT: v_mov_b32_e32 v16, s36 +; GFX8-NEXT: v_mov_b32_e32 v17, s37 +; GFX8-NEXT: v_mov_b32_e32 v18, s38 +; GFX8-NEXT: v_mov_b32_e32 v19, s39 +; GFX8-NEXT: v_mov_b32_e32 v20, s40 +; GFX8-NEXT: v_mov_b32_e32 v21, s41 +; GFX8-NEXT: v_mov_b32_e32 v22, s42 +; GFX8-NEXT: v_mov_b32_e32 v23, s43 +; GFX8-NEXT: v_mov_b32_e32 v24, s44 +; GFX8-NEXT: v_mov_b32_e32 v25, s45 +; GFX8-NEXT: v_mov_b32_e32 v26, s46 +; GFX8-NEXT: v_mov_b32_e32 v27, s47 +; GFX8-NEXT: v_mov_b32_e32 v28, s48 +; GFX8-NEXT: v_mov_b32_e32 v29, s49 +; GFX8-NEXT: v_mov_b32_e32 v30, s50 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: caller: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_load_dwordx16 s[16:31], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x64 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0xa4 +; GFX9-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s54, -1 +; GFX9-NEXT: s_mov_b32 s55, 0xe00000 +; GFX9-NEXT: s_add_u32 s52, s52, s11 +; GFX9-NEXT: v_lshl_add_u32 v3, v0, 3, 15 +; GFX9-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_and_b32_e32 v3, 0x3ff0, v3 +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: s_mov_b32 s9, 0 +; GFX9-NEXT: s_mov_b32 s33, 0 +; GFX9-NEXT: s_movk_i32 s32, 0x400 +; GFX9-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s15, s[6:7] +; GFX9-NEXT: v_readlane_b32 s34, v3, s15 +; GFX9-NEXT: s_bitset0_b64 s[6:7], s15 +; 
GFX9-NEXT: s_max_u32 s9, s9, s34 +; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX9-NEXT: ; %bb.2: +; GFX9-NEXT: s_mov_b32 s6, s32 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_lshl_add_u32 v3, s9, 6, v3 +; GFX9-NEXT: v_readfirstlane_b32 s32, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7b +; GFX9-NEXT: buffer_store_dword v3, off, s[52:55], s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s51 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: buffer_store_dword v3, off, s[52:55], s32 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: s_add_u32 s8, s4, 0x124 +; GFX9-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, callee@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, callee@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-NEXT: buffer_store_dword v3, off, s[52:55], s32 offset:4 +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: v_mov_b32_e32 v16, s36 +; GFX9-NEXT: v_mov_b32_e32 v17, s37 +; GFX9-NEXT: v_mov_b32_e32 v18, s38 +; GFX9-NEXT: v_mov_b32_e32 v19, s39 +; GFX9-NEXT: v_mov_b32_e32 v20, s40 +; GFX9-NEXT: v_mov_b32_e32 v21, s41 +; GFX9-NEXT: v_mov_b32_e32 
v22, s42 +; GFX9-NEXT: v_mov_b32_e32 v23, s43 +; GFX9-NEXT: v_mov_b32_e32 v24, s44 +; GFX9-NEXT: v_mov_b32_e32 v25, s45 +; GFX9-NEXT: v_mov_b32_e32 v26, s46 +; GFX9-NEXT: v_mov_b32_e32 v27, s47 +; GFX9-NEXT: v_mov_b32_e32 v28, s48 +; GFX9-NEXT: v_mov_b32_e32 v29, s49 +; GFX9-NEXT: v_mov_b32_e32 v30, s50 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: caller: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s12, s8 +; GFX1064-NEXT: s_clause 0x2 +; GFX1064-NEXT: s_load_dwordx16 s[16:31], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x64 +; GFX1064-NEXT: s_load_dword s8, s[4:5], 0xa4 +; GFX1064-NEXT: v_lshl_add_u32 v3, v0, 3, 15 +; GFX1064-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s54, -1 +; GFX1064-NEXT: s_mov_b32 s55, 0x31e16000 +; GFX1064-NEXT: v_and_b32_e32 v3, 0x3ff0, v3 +; GFX1064-NEXT: s_add_u32 s52, s52, s11 +; GFX1064-NEXT: s_addc_u32 s53, s53, 0 +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b32 s13, s9 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: s_mov_b32 s9, 0 +; GFX1064-NEXT: s_mov_b32 s33, 0 +; GFX1064-NEXT: s_movk_i32 s32, 0x400 +; GFX1064-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s15, s[6:7] +; GFX1064-NEXT: v_readlane_b32 s34, v3, s15 +; GFX1064-NEXT: s_bitset0_b64 s[6:7], s15 +; GFX1064-NEXT: s_max_u32 s9, s9, s34 +; GFX1064-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1064-NEXT: ; %bb.2: +; GFX1064-NEXT: s_mov_b32 s6, s32 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v6, s8 +; GFX1064-NEXT: s_add_u32 s8, s4, 0x124 +; GFX1064-NEXT: v_lshl_add_u32 v3, s9, 6, s6 +; GFX1064-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, callee@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, 
s5, callee@gotpcrel32@hi+12 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0x7b +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v5, s51 +; GFX1064-NEXT: v_readfirstlane_b32 s32, v3 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: buffer_store_dword v4, off, s[52:55], s6 +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: buffer_store_dword v5, off, s[52:55], s32 +; GFX1064-NEXT: buffer_store_dword v6, off, s[52:55], s32 offset:4 +; GFX1064-NEXT: v_mov_b32_e32 v3, s19 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: v_mov_b32_e32 v0, s16 +; GFX1064-NEXT: v_mov_b32_e32 v1, s17 +; GFX1064-NEXT: v_mov_b32_e32 v2, s18 +; GFX1064-NEXT: v_mov_b32_e32 v4, s20 +; GFX1064-NEXT: v_mov_b32_e32 v5, s21 +; GFX1064-NEXT: v_mov_b32_e32 v6, s22 +; GFX1064-NEXT: v_mov_b32_e32 v7, s23 +; GFX1064-NEXT: v_mov_b32_e32 v8, s24 +; GFX1064-NEXT: v_mov_b32_e32 v9, s25 +; GFX1064-NEXT: v_mov_b32_e32 v10, s26 +; GFX1064-NEXT: v_mov_b32_e32 v11, s27 +; GFX1064-NEXT: v_mov_b32_e32 v12, s28 +; GFX1064-NEXT: v_mov_b32_e32 v13, s29 +; GFX1064-NEXT: v_mov_b32_e32 v14, s30 +; GFX1064-NEXT: v_mov_b32_e32 v15, s31 +; GFX1064-NEXT: v_mov_b32_e32 v16, s36 +; GFX1064-NEXT: v_mov_b32_e32 v17, s37 +; GFX1064-NEXT: v_mov_b32_e32 v18, s38 +; GFX1064-NEXT: v_mov_b32_e32 v19, s39 +; GFX1064-NEXT: v_mov_b32_e32 v20, s40 +; GFX1064-NEXT: v_mov_b32_e32 v21, s41 +; GFX1064-NEXT: v_mov_b32_e32 v22, s42 +; GFX1064-NEXT: v_mov_b32_e32 v23, s43 +; GFX1064-NEXT: v_mov_b32_e32 v24, s44 +; GFX1064-NEXT: v_mov_b32_e32 v25, s45 +; GFX1064-NEXT: v_mov_b32_e32 v26, s46 +; GFX1064-NEXT: v_mov_b32_e32 v27, s47 +; GFX1064-NEXT: v_mov_b32_e32 v28, s48 +; GFX1064-NEXT: v_mov_b32_e32 v29, s49 +; GFX1064-NEXT: v_mov_b32_e32 v30, s50 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX1064-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: caller: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s54, -1 +; GFX1032-NEXT: s_mov_b32 s55, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s52, s52, s11 +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-NEXT: s_clause 0x2 +; GFX1032-NEXT: s_load_dwordx16 s[16:31], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x64 +; GFX1032-NEXT: s_load_dword s6, s[4:5], 0xa4 +; GFX1032-NEXT: v_lshl_add_u32 v3, v0, 3, 15 +; GFX1032-NEXT: s_addc_u32 s53, s53, 0 +; GFX1032-NEXT: s_mov_b32 s13, s9 +; GFX1032-NEXT: s_mov_b32 s12, s8 +; GFX1032-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-NEXT: v_and_b32_e32 v3, 0x3ff0, v3 +; GFX1032-NEXT: s_mov_b32 s7, 0 +; GFX1032-NEXT: s_mov_b32 s33, 0 +; GFX1032-NEXT: s_movk_i32 s32, 0x200 +; GFX1032-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s9, s8 +; GFX1032-NEXT: v_readlane_b32 s15, v3, s9 +; GFX1032-NEXT: s_bitset0_b32 s8, s9 +; GFX1032-NEXT: s_max_u32 s7, s7, s15 +; GFX1032-NEXT: s_cmp_lg_u32 s8, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1032-NEXT: ; %bb.2: +; GFX1032-NEXT: v_mov_b32_e32 v4, 0x7b +; GFX1032-NEXT: s_mov_b32 s8, s32 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v5, s51 +; GFX1032-NEXT: v_lshl_add_u32 v3, s7, 5, s8 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: buffer_store_dword v4, off, s[52:55], s8 +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032-NEXT: s_add_u32 s8, s4, 0x124 +; GFX1032-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, callee@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, callee@gotpcrel32@hi+12 +; GFX1032-NEXT: v_readfirstlane_b32 s32, v3 +; GFX1032-NEXT: s_load_dwordx2 
s[34:35], s[4:5], 0x0 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: v_mov_b32_e32 v6, s6 +; GFX1032-NEXT: buffer_store_dword v5, off, s[52:55], s32 +; GFX1032-NEXT: buffer_store_dword v6, off, s[52:55], s32 offset:4 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: v_mov_b32_e32 v0, s16 +; GFX1032-NEXT: v_mov_b32_e32 v1, s17 +; GFX1032-NEXT: v_mov_b32_e32 v2, s18 +; GFX1032-NEXT: v_mov_b32_e32 v3, s19 +; GFX1032-NEXT: v_mov_b32_e32 v4, s20 +; GFX1032-NEXT: v_mov_b32_e32 v5, s21 +; GFX1032-NEXT: v_mov_b32_e32 v6, s22 +; GFX1032-NEXT: v_mov_b32_e32 v7, s23 +; GFX1032-NEXT: v_mov_b32_e32 v8, s24 +; GFX1032-NEXT: v_mov_b32_e32 v9, s25 +; GFX1032-NEXT: v_mov_b32_e32 v10, s26 +; GFX1032-NEXT: v_mov_b32_e32 v11, s27 +; GFX1032-NEXT: v_mov_b32_e32 v12, s28 +; GFX1032-NEXT: v_mov_b32_e32 v13, s29 +; GFX1032-NEXT: v_mov_b32_e32 v14, s30 +; GFX1032-NEXT: v_mov_b32_e32 v15, s31 +; GFX1032-NEXT: v_mov_b32_e32 v16, s36 +; GFX1032-NEXT: v_mov_b32_e32 v17, s37 +; GFX1032-NEXT: v_mov_b32_e32 v18, s38 +; GFX1032-NEXT: v_mov_b32_e32 v19, s39 +; GFX1032-NEXT: v_mov_b32_e32 v20, s40 +; GFX1032-NEXT: v_mov_b32_e32 v21, s41 +; GFX1032-NEXT: v_mov_b32_e32 v22, s42 +; GFX1032-NEXT: v_mov_b32_e32 v23, s43 +; GFX1032-NEXT: v_mov_b32_e32 v24, s44 +; GFX1032-NEXT: v_mov_b32_e32 v25, s45 +; GFX1032-NEXT: v_mov_b32_e32 v26, s46 +; GFX1032-NEXT: v_mov_b32_e32 v27, s47 +; GFX1032-NEXT: v_mov_b32_e32 v28, s48 +; GFX1032-NEXT: v_mov_b32_e32 v29, s49 +; GFX1032-NEXT: v_mov_b32_e32 v30, s50 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: caller: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s12, s8 +; GFX1164-NEXT: s_clause 0x2 +; GFX1164-NEXT: s_load_b512 s[16:31], s[4:5], 0x24 +; GFX1164-NEXT: s_load_b512 s[36:51], 
s[4:5], 0x64 +; GFX1164-NEXT: s_load_b32 s8, s[4:5], 0xa4 +; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b32 s13, s9 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-NEXT: s_mov_b64 s[6:7], exec +; GFX1164-NEXT: v_lshl_add_u32 v1, v1, 3, 15 +; GFX1164-NEXT: s_mov_b32 s9, 0 +; GFX1164-NEXT: s_mov_b32 s33, 0 +; GFX1164-NEXT: s_mov_b32 s32, 16 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff0, v1 +; GFX1164-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b64 s15, s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s34, v1, s15 +; GFX1164-NEXT: s_bitset0_b64 s[6:7], s15 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_max_u32 s9, s9, s34 +; GFX1164-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1164-NEXT: ; %bb.2: +; GFX1164-NEXT: s_mov_b32 s6, s32 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s8 +; GFX1164-NEXT: s_add_u32 s8, s4, 0x124 +; GFX1164-NEXT: v_lshl_add_u32 v3, s9, 6, s6 +; GFX1164-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, callee@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, callee@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0x7b +; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s51 +; GFX1164-NEXT: v_readfirstlane_b32 s32, v3 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: scratch_store_b32 off, v4, s6 dlc +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, s16 +; GFX1164-NEXT: v_mov_b32_e32 v3, s19 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], s32 +; GFX1164-NEXT: v_mov_b32_e32 v1, s17 +; GFX1164-NEXT: v_mov_b32_e32 v2, s18 +; GFX1164-NEXT: v_mov_b32_e32 v4, s20 +; GFX1164-NEXT: v_mov_b32_e32 v5, s21 +; GFX1164-NEXT: v_mov_b32_e32 v6, s22 +; GFX1164-NEXT: 
v_mov_b32_e32 v7, s23 +; GFX1164-NEXT: v_mov_b32_e32 v8, s24 +; GFX1164-NEXT: v_mov_b32_e32 v9, s25 +; GFX1164-NEXT: v_mov_b32_e32 v10, s26 +; GFX1164-NEXT: v_mov_b32_e32 v11, s27 +; GFX1164-NEXT: v_mov_b32_e32 v12, s28 +; GFX1164-NEXT: v_mov_b32_e32 v13, s29 +; GFX1164-NEXT: v_mov_b32_e32 v14, s30 +; GFX1164-NEXT: v_mov_b32_e32 v15, s31 +; GFX1164-NEXT: v_mov_b32_e32 v16, s36 +; GFX1164-NEXT: v_mov_b32_e32 v17, s37 +; GFX1164-NEXT: v_mov_b32_e32 v18, s38 +; GFX1164-NEXT: v_mov_b32_e32 v19, s39 +; GFX1164-NEXT: v_mov_b32_e32 v20, s40 +; GFX1164-NEXT: v_mov_b32_e32 v21, s41 +; GFX1164-NEXT: v_mov_b32_e32 v22, s42 +; GFX1164-NEXT: v_mov_b32_e32 v23, s43 +; GFX1164-NEXT: v_mov_b32_e32 v24, s44 +; GFX1164-NEXT: v_mov_b32_e32 v25, s45 +; GFX1164-NEXT: v_mov_b32_e32 v26, s46 +; GFX1164-NEXT: v_mov_b32_e32 v27, s47 +; GFX1164-NEXT: v_mov_b32_e32 v28, s48 +; GFX1164-NEXT: v_mov_b32_e32 v29, s49 +; GFX1164-NEXT: v_mov_b32_e32 v30, s50 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: caller: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-NEXT: s_clause 0x2 +; GFX1132-NEXT: s_load_b512 s[16:31], s[4:5], 0x24 +; GFX1132-NEXT: s_load_b512 s[36:51], s[4:5], 0x64 +; GFX1132-NEXT: s_load_b32 s6, s[4:5], 0xa4 +; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b32 s8, exec_lo +; GFX1132-NEXT: s_mov_b32 s7, 0 +; GFX1132-NEXT: s_mov_b32 s33, 0 +; GFX1132-NEXT: v_lshl_add_u32 v1, v1, 3, 15 +; GFX1132-NEXT: s_mov_b32 s32, 16 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff0, v1 +; GFX1132-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s9, s8 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s13, v1, s9 +; 
GFX1132-NEXT: s_bitset0_b32 s8, s9 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_max_u32 s7, s7, s13 +; GFX1132-NEXT: s_cmp_lg_u32 s8, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1132-NEXT: ; %bb.2: +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v4, 0x7b :: v_dual_mov_b32 v1, s51 +; GFX1132-NEXT: s_mov_b32 s8, s32 +; GFX1132-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: v_lshl_add_u32 v3, s7, 5, s8 +; GFX1132-NEXT: scratch_store_b32 off, v4, s8 dlc +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: s_add_u32 s8, s4, 0x124 +; GFX1132-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, callee@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, callee@gotpcrel32@hi+12 +; GFX1132-NEXT: v_readfirstlane_b32 s32, v3 +; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v3, s19 +; GFX1132-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v5, s21 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], s32 +; GFX1132-NEXT: v_dual_mov_b32 v1, s17 :: v_dual_mov_b32 v2, s18 +; GFX1132-NEXT: v_dual_mov_b32 v6, s22 :: v_dual_mov_b32 v7, s23 +; GFX1132-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v9, s25 +; GFX1132-NEXT: v_dual_mov_b32 v10, s26 :: v_dual_mov_b32 v11, s27 +; GFX1132-NEXT: v_dual_mov_b32 v12, s28 :: v_dual_mov_b32 v13, s29 +; GFX1132-NEXT: v_dual_mov_b32 v14, s30 :: v_dual_mov_b32 v15, s31 +; GFX1132-NEXT: v_dual_mov_b32 v16, s36 :: v_dual_mov_b32 v17, s37 +; GFX1132-NEXT: v_dual_mov_b32 v18, s38 :: v_dual_mov_b32 v19, s39 +; GFX1132-NEXT: v_dual_mov_b32 v20, s40 :: v_dual_mov_b32 v21, s41 +; GFX1132-NEXT: v_dual_mov_b32 v22, s42 :: v_dual_mov_b32 v23, s43 +; GFX1132-NEXT: v_dual_mov_b32 v24, s44 :: v_dual_mov_b32 v25, s45 +; GFX1132-NEXT: v_dual_mov_b32 v26, s46 :: v_dual_mov_b32 v27, s47 +; GFX1132-NEXT: v_dual_mov_b32 v28, s48 :: v_dual_mov_b32 v29, s49 +; GFX1132-NEXT: v_mov_b32_e32 v30, s50 
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX1132-NEXT: s_endpgm + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %dyn_alloca_caller = alloca i64, i32 %idx, addrspace(5) + store volatile i32 123, ptr addrspace(5) %dyn_alloca_caller + call void @callee(<33 x i32> %a) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX10: {{.*}}