diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp index 0137b3f5943d7..1da795dd5d138 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp @@ -27,6 +27,12 @@ using namespace llvm; #define DEBUG_TYPE "amdgpu-prepare-agpr-alloc" +static cl::opt<bool> InflateToAVClass( + "amdgpu-avgpr-inflation", cl::Hidden, + cl::desc("Whether to inflate registers to the avgpr register " + "class, which is assignable to either vgpr or agpr."), + cl::init(false)); + namespace { class AMDGPUPrepareAGPRAllocImpl { @@ -97,6 +103,8 @@ bool AMDGPUPrepareAGPRAllocImpl::run(MachineFunction &MF) { const MCInstrDesc &AVImmPseudo32 = TII.get(AMDGPU::AV_MOV_B32_IMM_PSEUDO); const MCInstrDesc &AVImmPseudo64 = TII.get(AMDGPU::AV_MOV_B64_IMM_PSEUDO); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); bool Changed = false; for (MachineBasicBlock &MBB : MF) { @@ -119,6 +127,20 @@ bool AMDGPUPrepareAGPRAllocImpl::run(MachineFunction &MF) { Changed = true; continue; } + + if (!InflateToAVClass) + continue; + + for (MachineOperand &Op : MI.all_defs()) { + Register DefReg = Op.getReg(); + if (DefReg.isPhysical()) + continue; + + const TargetRegisterClass *RC = MRI.getRegClass(DefReg); + + if (TRI->hasVectorRegisters(RC)) + Changed |= MRI.recomputeRegClass(DefReg); + } } } diff --git a/llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll b/llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll new file mode 100644 index 0000000000000..bf4bf25e6b02a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll @@ -0,0 +1,854 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 --amdgpu-mfma-vgpr-form=1 --greedy-regclass-priority-trumps-globalness=1 --amdgpu-avgpr-inflation < %s | FileCheck -check-prefixes=INFLATE %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 --amdgpu-mfma-vgpr-form=1 --greedy-regclass-priority-trumps-globalness=1 < %s | FileCheck -check-prefixes=GCN %s + +define amdgpu_kernel void @bad_rp(ptr addrspace(3) %in0, ptr addrspace(0) %out, i1 %cond) #0 { +; CHECK-LABEL: bad_rp: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 +; CHECK-NEXT: s_load_dword s1, s[4:5], 0x10 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-NEXT: ds_read_b128 a[0:3], v0 +; CHECK-NEXT: ds_read_b128 a[4:7], v0 offset:16 +; CHECK-NEXT: ds_read_b128 a[8:11], v0 offset:32 +; CHECK-NEXT: ds_read_b128 a[12:15], v0 offset:48 +; CHECK-NEXT: ds_read_b128 a[16:19], v0 offset:64 +; CHECK-NEXT: ds_read_b128 a[20:23], v0 offset:80 +; CHECK-NEXT: ds_read_b128 a[24:27], v0 offset:96 +; CHECK-NEXT: ds_read_b128 a[28:31], v0 offset:112 +; CHECK-NEXT: ds_read_b128 a[32:35], v0 offset:128 +; CHECK-NEXT: ds_read_b128 a[36:39], v0 offset:144 +; CHECK-NEXT: ds_read_b128 a[40:43], v0 offset:160 +; CHECK-NEXT: ds_read_b128 a[44:47], v0 offset:176 +; CHECK-NEXT: ds_read_b128 a[48:51], v0 offset:192 +; CHECK-NEXT: ds_read_b128 a[52:55], v0 offset:208 +; CHECK-NEXT: ds_read_b128 a[56:59], v0 offset:224 +; CHECK-NEXT: ds_read_b128 a[60:63], v0 offset:240 +; CHECK-NEXT: s_bitcmp1_b32 s1, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], -1 +; CHECK-NEXT: .LBB0_1: ; %bb.1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_waitcnt lgkmcnt(14) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[240:255], a[0:3], a[0:3], 0 +; CHECK-NEXT: s_andn2_b64 
vcc, exec, s[0:1] +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[224:239], a[4:7], a[4:7], v[240:255] +; CHECK-NEXT: s_waitcnt lgkmcnt(13) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[208:223], a[8:11], a[8:11], v[224:239] +; CHECK-NEXT: s_waitcnt lgkmcnt(12) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[192:207], a[12:15], a[12:15], v[208:223] +; CHECK-NEXT: s_waitcnt lgkmcnt(11) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[176:191], a[16:19], a[16:19], v[192:207] +; CHECK-NEXT: s_waitcnt lgkmcnt(10) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[160:175], a[20:23], a[20:23], v[176:191] +; CHECK-NEXT: s_waitcnt lgkmcnt(9) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[144:159], a[24:27], a[24:27], v[160:175] +; CHECK-NEXT: s_waitcnt lgkmcnt(8) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[128:143], a[28:31], a[28:31], v[144:159] +; CHECK-NEXT: s_waitcnt lgkmcnt(7) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[112:127], a[32:35], a[32:35], v[128:143] +; CHECK-NEXT: s_waitcnt lgkmcnt(6) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[96:111], a[36:39], a[36:39], v[112:127] +; CHECK-NEXT: s_waitcnt lgkmcnt(5) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[80:95], a[40:43], a[40:43], v[96:111] +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[64:79], a[44:47], a[44:47], v[80:95] +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[48:63], a[48:51], a[48:51], v[64:79] +; CHECK-NEXT: s_waitcnt lgkmcnt(2) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[32:47], a[52:55], a[52:55], v[48:63] +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], a[56:59], a[56:59], v[32:47] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], a[60:63], a[60:63], v[16:31] +; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 +; CHECK-NEXT: ; %bb.2: ; %bb.2 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b64_e32 v[168:169], s[0:1] +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[244:247] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[240:243] +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[224:227] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[228:231] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[208:211] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[212:215] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[192:195] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[196:199] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[180:183] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[176:179] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[160:163] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[164:167] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[148:151] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[156:159] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[152:155] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[144:147] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[140:143] offset:272 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[136:139] offset:256 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[132:135] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[128:131] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[124:127] offset:304 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[120:123] offset:288 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[116:119] offset:272 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[112:115] offset:256 +; 
CHECK-NEXT: flat_store_dwordx4 v[168:169], v[108:111] offset:336 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[104:107] offset:320 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[100:103] offset:304 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[96:99] offset:288 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[92:95] offset:368 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[88:91] offset:352 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[84:87] offset:336 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[80:83] offset:320 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[76:79] offset:400 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[72:75] offset:384 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[68:71] offset:368 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[64:67] offset:352 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[60:63] offset:432 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[56:59] offset:416 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[52:55] offset:400 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[48:51] offset:384 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[44:47] offset:464 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[40:43] offset:448 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[36:39] offset:432 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[32:35] offset:416 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[28:31] offset:496 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[24:27] offset:480 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[20:23] offset:464 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[16:19] offset:448 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[12:15] offset:528 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[8:11] offset:512 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[4:7] offset:496 +; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[0:3] offset:480 +; CHECK-NEXT: s_endpgm +; INFLATE-LABEL: bad_rp: +; INFLATE: ; %bb.0: +; INFLATE-NEXT: s_load_dword s0, s[4:5], 0x0 +; INFLATE-NEXT: s_load_dword s1, s[4:5], 0x10 +; INFLATE-NEXT: s_waitcnt lgkmcnt(0) +; INFLATE-NEXT: v_mov_b32_e32 v0, s0 +; INFLATE-NEXT: ds_read_b128 a[0:3], v0 +; INFLATE-NEXT: ds_read_b128 a[4:7], v0 offset:16 +; INFLATE-NEXT: ds_read_b128 a[8:11], v0 offset:32 +; INFLATE-NEXT: ds_read_b128 a[12:15], v0 offset:48 +; INFLATE-NEXT: ds_read_b128 a[16:19], v0 offset:64 +; INFLATE-NEXT: ds_read_b128 a[20:23], v0 offset:80 +; INFLATE-NEXT: ds_read_b128 a[24:27], v0 offset:96 +; INFLATE-NEXT: ds_read_b128 a[28:31], v0 offset:112 +; INFLATE-NEXT: ds_read_b128 a[32:35], v0 offset:128 +; INFLATE-NEXT: ds_read_b128 a[36:39], v0 offset:144 +; INFLATE-NEXT: ds_read_b128 a[40:43], v0 offset:160 +; INFLATE-NEXT: ds_read_b128 a[44:47], v0 offset:176 +; INFLATE-NEXT: ds_read_b128 a[48:51], v0 offset:192 +; INFLATE-NEXT: ds_read_b128 a[52:55], v0 offset:208 +; INFLATE-NEXT: ds_read_b128 a[56:59], v0 offset:224 +; INFLATE-NEXT: ds_read_b128 a[60:63], v0 offset:240 +; INFLATE-NEXT: s_bitcmp1_b32 s1, 0 +; INFLATE-NEXT: s_cselect_b64 s[0:1], -1, 0 +; INFLATE-NEXT: s_xor_b64 s[0:1], s[0:1], -1 +; INFLATE-NEXT: .LBB0_1: ; %bb.1 +; INFLATE-NEXT: ; =>This Inner Loop Header: Depth=1 +; INFLATE-NEXT: s_waitcnt lgkmcnt(14) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[240:255], a[0:3], a[0:3], 0 +; INFLATE-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[224:239], a[4:7], a[4:7], v[240:255] +; INFLATE-NEXT: s_waitcnt lgkmcnt(13) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[208:223], a[8:11], a[8:11], v[224:239] +; INFLATE-NEXT: s_waitcnt lgkmcnt(12) +; 
INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[192:207], a[12:15], a[12:15], v[208:223] +; INFLATE-NEXT: s_waitcnt lgkmcnt(11) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[176:191], a[16:19], a[16:19], v[192:207] +; INFLATE-NEXT: s_waitcnt lgkmcnt(10) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[160:175], a[20:23], a[20:23], v[176:191] +; INFLATE-NEXT: s_waitcnt lgkmcnt(9) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[144:159], a[24:27], a[24:27], v[160:175] +; INFLATE-NEXT: s_waitcnt lgkmcnt(8) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[128:143], a[28:31], a[28:31], v[144:159] +; INFLATE-NEXT: s_waitcnt lgkmcnt(7) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[112:127], a[32:35], a[32:35], v[128:143] +; INFLATE-NEXT: s_waitcnt lgkmcnt(6) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[96:111], a[36:39], a[36:39], v[112:127] +; INFLATE-NEXT: s_waitcnt lgkmcnt(5) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[80:95], a[40:43], a[40:43], v[96:111] +; INFLATE-NEXT: s_waitcnt lgkmcnt(4) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[64:79], a[44:47], a[44:47], v[80:95] +; INFLATE-NEXT: s_waitcnt lgkmcnt(3) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[48:63], a[48:51], a[48:51], v[64:79] +; INFLATE-NEXT: s_waitcnt lgkmcnt(2) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[32:47], a[52:55], a[52:55], v[48:63] +; INFLATE-NEXT: s_waitcnt lgkmcnt(1) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], a[56:59], a[56:59], v[32:47] +; INFLATE-NEXT: s_waitcnt lgkmcnt(0) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], a[60:63], a[60:63], v[16:31] +; INFLATE-NEXT: s_cbranch_vccnz .LBB0_1 +; INFLATE-NEXT: ; %bb.2: ; %bb.2 +; INFLATE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; INFLATE-NEXT: s_waitcnt lgkmcnt(0) +; INFLATE-NEXT: v_mov_b64_e32 v[168:169], s[0:1] +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[244:247] offset:16 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[240:243] +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[224:227] offset:32 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[228:231] offset:48 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[208:211] offset:64 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[212:215] offset:80 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[192:195] offset:96 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[196:199] offset:112 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[180:183] offset:144 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[176:179] offset:128 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[160:163] offset:160 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[164:167] offset:176 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[148:151] offset:208 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[156:159] offset:240 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[152:155] offset:224 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[144:147] offset:192 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[140:143] offset:272 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[136:139] offset:256 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[132:135] offset:240 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[128:131] offset:224 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[124:127] offset:304 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[120:123] offset:288 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[116:119] offset:272 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[112:115] offset:256 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[108:111] offset:336 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[104:107] offset:320 +; 
INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[100:103] offset:304 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[96:99] offset:288 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[92:95] offset:368 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[88:91] offset:352 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[84:87] offset:336 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[80:83] offset:320 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[76:79] offset:400 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[72:75] offset:384 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[68:71] offset:368 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[64:67] offset:352 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[60:63] offset:432 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[56:59] offset:416 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[52:55] offset:400 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[48:51] offset:384 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[44:47] offset:464 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[40:43] offset:448 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[36:39] offset:432 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[32:35] offset:416 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[28:31] offset:496 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[24:27] offset:480 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[20:23] offset:464 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[16:19] offset:448 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[12:15] offset:528 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[8:11] offset:512 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[4:7] offset:496 +; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[0:3] offset:480 +; INFLATE-NEXT: s_endpgm +; +; GCN-LABEL: bad_rp: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 +; GCN-NEXT: s_load_dword s1, s[4:5], 0x10 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v12, s0 +; GCN-NEXT: ds_read_b128 a[48:51], v12 +; GCN-NEXT: ds_read_b128 a[52:55], v12 offset:16 +; GCN-NEXT: ds_read_b128 a[56:59], v12 offset:32 +; GCN-NEXT: ds_read_b128 a[60:63], v12 offset:48 +; GCN-NEXT: ds_read_b128 a[64:67], v12 offset:64 +; GCN-NEXT: ds_read_b128 a[68:71], v12 offset:80 +; GCN-NEXT: ds_read_b128 a[72:75], v12 offset:96 +; GCN-NEXT: ds_read_b128 a[76:79], v12 offset:112 +; GCN-NEXT: ds_read_b128 v[0:3], v12 offset:128 +; GCN-NEXT: ds_read_b128 v[4:7], v12 offset:144 +; GCN-NEXT: ds_read_b128 v[8:11], v12 offset:160 +; GCN-NEXT: ds_read_b128 v[32:35], v12 offset:176 +; GCN-NEXT: ds_read_b128 v[36:39], v12 offset:192 +; GCN-NEXT: ds_read_b128 v[40:43], v12 offset:208 +; GCN-NEXT: ds_read_b128 v[44:47], v12 offset:224 +; GCN-NEXT: ds_read_b128 v[12:15], v12 offset:240 +; GCN-NEXT: s_bitcmp1_b32 s1, 0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 +; GCN-NEXT: .LBB0_1: ; %bb.1 +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_waitcnt lgkmcnt(14) +; GCN-NEXT: s_nop 9 +; GCN-NEXT: v_accvgpr_read_b32 v16, a48 +; GCN-NEXT: v_accvgpr_read_b32 v17, a49 +; GCN-NEXT: v_accvgpr_read_b32 v18, a50 +; GCN-NEXT: v_accvgpr_read_b32 v19, a51 +; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[240:255], v[16:19], v[16:19], 0 +; GCN-NEXT: v_accvgpr_read_b32 v16, a52 +; GCN-NEXT: v_accvgpr_read_b32 v17, a53 +; GCN-NEXT: v_accvgpr_read_b32 v18, a54 +; GCN-NEXT: v_accvgpr_read_b32 v19, a55 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x16_f16 
v[224:239], v[16:19], v[16:19], v[240:255] +; GCN-NEXT: s_waitcnt lgkmcnt(13) +; GCN-NEXT: v_accvgpr_read_b32 v16, a56 +; GCN-NEXT: v_accvgpr_read_b32 v17, a57 +; GCN-NEXT: v_accvgpr_read_b32 v18, a58 +; GCN-NEXT: v_accvgpr_read_b32 v19, a59 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[208:223], v[16:19], v[16:19], v[224:239] +; GCN-NEXT: s_waitcnt lgkmcnt(12) +; GCN-NEXT: v_accvgpr_read_b32 v16, a60 +; GCN-NEXT: v_accvgpr_read_b32 v17, a61 +; GCN-NEXT: v_accvgpr_read_b32 v18, a62 +; GCN-NEXT: v_accvgpr_read_b32 v19, a63 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[192:207], v[16:19], v[16:19], v[208:223] +; GCN-NEXT: s_waitcnt lgkmcnt(11) +; GCN-NEXT: v_accvgpr_read_b32 v16, a64 +; GCN-NEXT: v_accvgpr_read_b32 v17, a65 +; GCN-NEXT: v_accvgpr_read_b32 v18, a66 +; GCN-NEXT: v_accvgpr_read_b32 v19, a67 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[176:191], v[16:19], v[16:19], v[192:207] +; GCN-NEXT: s_waitcnt lgkmcnt(10) +; GCN-NEXT: v_accvgpr_read_b32 v16, a68 +; GCN-NEXT: v_accvgpr_read_b32 v17, a69 +; GCN-NEXT: v_accvgpr_read_b32 v18, a70 +; GCN-NEXT: v_accvgpr_read_b32 v19, a71 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[160:175], v[16:19], v[16:19], v[176:191] +; GCN-NEXT: s_waitcnt lgkmcnt(9) +; GCN-NEXT: v_accvgpr_read_b32 v16, a72 +; GCN-NEXT: v_accvgpr_read_b32 v17, a73 +; GCN-NEXT: v_accvgpr_read_b32 v18, a74 +; GCN-NEXT: v_accvgpr_read_b32 v19, a75 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[144:159], v[16:19], v[16:19], v[160:175] +; GCN-NEXT: s_waitcnt lgkmcnt(8) +; GCN-NEXT: v_accvgpr_read_b32 v16, a76 +; GCN-NEXT: v_accvgpr_read_b32 v17, a77 +; GCN-NEXT: v_accvgpr_read_b32 v18, a78 +; GCN-NEXT: v_accvgpr_read_b32 v19, a79 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[128:143], v[16:19], v[16:19], v[144:159] +; GCN-NEXT: s_waitcnt lgkmcnt(7) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[112:127], v[0:3], v[0:3], v[128:143] +; GCN-NEXT: s_waitcnt lgkmcnt(6) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[96:111], v[4:7], v[4:7], v[112:127] +; GCN-NEXT: s_waitcnt lgkmcnt(5) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[80:95], v[8:11], v[8:11], v[96:111] +; GCN-NEXT: s_waitcnt lgkmcnt(4) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[64:79], v[32:35], v[32:35], v[80:95] +; GCN-NEXT: s_waitcnt lgkmcnt(3) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[48:63], v[36:39], v[36:39], v[64:79] +; GCN-NEXT: s_waitcnt lgkmcnt(2) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[40:43], v[40:43], v[48:63] +; GCN-NEXT: s_nop 9 +; GCN-NEXT: v_accvgpr_write_b32 a32, v48 +; GCN-NEXT: v_accvgpr_write_b32 a33, v49 +; GCN-NEXT: v_accvgpr_write_b32 a34, v50 +; GCN-NEXT: v_accvgpr_write_b32 a35, v51 +; GCN-NEXT: v_accvgpr_write_b32 a36, v52 +; GCN-NEXT: v_accvgpr_write_b32 a37, v53 +; GCN-NEXT: v_accvgpr_write_b32 a38, v54 +; GCN-NEXT: v_accvgpr_write_b32 a39, v55 +; GCN-NEXT: v_accvgpr_write_b32 a40, v56 +; GCN-NEXT: v_accvgpr_write_b32 a41, v57 +; GCN-NEXT: v_accvgpr_write_b32 a42, v58 +; GCN-NEXT: v_accvgpr_write_b32 a43, v59 +; GCN-NEXT: v_accvgpr_write_b32 a44, v60 +; GCN-NEXT: v_accvgpr_write_b32 a45, v61 +; GCN-NEXT: v_accvgpr_write_b32 a46, v62 +; GCN-NEXT: v_accvgpr_write_b32 a47, v63 +; GCN-NEXT: s_waitcnt lgkmcnt(1) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[48:63], v[44:47], v[44:47], v[16:31] +; GCN-NEXT: v_accvgpr_write_b32 a16, v16 +; GCN-NEXT: v_accvgpr_write_b32 a17, v17 +; GCN-NEXT: v_accvgpr_write_b32 a18, v18 +; GCN-NEXT: v_accvgpr_write_b32 a19, v19 +; GCN-NEXT: v_accvgpr_write_b32 a20, v20 +; GCN-NEXT: v_accvgpr_write_b32 a21, v21 
+; GCN-NEXT: v_accvgpr_write_b32 a22, v22 +; GCN-NEXT: v_accvgpr_write_b32 a23, v23 +; GCN-NEXT: v_accvgpr_write_b32 a24, v24 +; GCN-NEXT: v_accvgpr_write_b32 a25, v25 +; GCN-NEXT: v_accvgpr_write_b32 a26, v26 +; GCN-NEXT: v_accvgpr_write_b32 a27, v27 +; GCN-NEXT: v_accvgpr_write_b32 a28, v28 +; GCN-NEXT: v_accvgpr_write_b32 a29, v29 +; GCN-NEXT: v_accvgpr_write_b32 a30, v30 +; GCN-NEXT: v_accvgpr_write_b32 a31, v31 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[12:15], v[12:15], v[48:63] +; GCN-NEXT: s_cbranch_vccnz .LBB0_1 +; GCN-NEXT: ; %bb.2: ; %bb.2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GCN-NEXT: v_accvgpr_write_b32 a0, v48 +; GCN-NEXT: v_accvgpr_write_b32 a1, v49 +; GCN-NEXT: v_accvgpr_write_b32 a2, v50 +; GCN-NEXT: v_accvgpr_write_b32 a3, v51 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GCN-NEXT: v_accvgpr_write_b32 a4, v52 +; GCN-NEXT: v_accvgpr_write_b32 a5, v53 +; GCN-NEXT: v_accvgpr_write_b32 a6, v54 +; GCN-NEXT: v_accvgpr_write_b32 a7, v55 +; GCN-NEXT: v_accvgpr_write_b32 a8, v56 +; GCN-NEXT: v_accvgpr_write_b32 a9, v57 +; GCN-NEXT: v_accvgpr_write_b32 a10, v58 +; GCN-NEXT: v_accvgpr_write_b32 a11, v59 +; GCN-NEXT: v_accvgpr_write_b32 a12, v60 +; GCN-NEXT: v_accvgpr_write_b32 a13, v61 +; GCN-NEXT: v_accvgpr_write_b32 a14, v62 +; GCN-NEXT: v_accvgpr_write_b32 a15, v63 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[244:247] offset:16 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[240:243] +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[224:227] offset:32 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[228:231] offset:48 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[208:211] offset:64 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[212:215] offset:80 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[192:195] offset:96 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[196:199] offset:112 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[180:183] offset:144 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[176:179] offset:128 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[160:163] offset:160 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[164:167] offset:176 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[148:151] offset:208 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[156:159] offset:240 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[152:155] offset:224 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[144:147] offset:192 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[140:143] offset:272 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[136:139] offset:256 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[132:135] offset:240 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[128:131] offset:224 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[124:127] offset:304 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[120:123] offset:288 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[116:119] offset:272 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[112:115] offset:256 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[108:111] offset:336 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[104:107] offset:320 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[100:103] offset:304 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[96:99] offset:288 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[92:95] offset:368 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[88:91] offset:352 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[84:87] offset:336 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[80:83] offset:320 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[76:79] offset:400 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[72:75] offset:384 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[68:71] offset:368 +; GCN-NEXT: flat_store_dwordx4 
v[0:1], v[64:67] offset:352 +; GCN-NEXT: flat_store_dwordx4 v[0:1], a[44:47] offset:432 +; GCN-NEXT: flat_store_dwordx4 v[0:1], a[40:43] offset:416 +; GCN-NEXT: flat_store_dwordx4 v[0:1], a[36:39] offset:400 +; GCN-NEXT: flat_store_dwordx4 v[0:1], a[32:35] offset:384 +; GCN-NEXT: flat_store_dwordx4 v[0:1], a[28:31] offset:464 +; GCN-NEXT: flat_store_dwordx4 v[0:1], a[24:27] offset:448 +; GCN-NEXT: flat_store_dwordx4 v[0:1], a[20:23] offset:432 +; GCN-NEXT: flat_store_dwordx4 v[0:1], a[16:19] offset:416 +; GCN-NEXT: flat_store_dwordx4 v[0:1], a[12:15] offset:496 +; GCN-NEXT: flat_store_dwordx4 v[0:1], a[8:11] offset:480 +; GCN-NEXT: flat_store_dwordx4 v[0:1], a[4:7] offset:464 +; GCN-NEXT: flat_store_dwordx4 v[0:1], a[0:3] offset:448 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[28:31] offset:528 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[24:27] offset:512 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[20:23] offset:496 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[16:19] offset:480 +; GCN-NEXT: s_endpgm + %gep1 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 4 + %gep2 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 8 + %gep3 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 12 + %gep4 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 16 + %gep5 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 20 + %gep6 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 24 + %gep7 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 28 + %gep8 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 32 + %gep9 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 36 + %gep10 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 40 + %gep11 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 44 + %gep12 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 48 + %gep13 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 52 + %gep14 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 56 + %gep15 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 60 + %load0 = load <8 x half>, ptr addrspace(3) %in0, align 16 + %load1 = load <8 x half>, ptr addrspace(3) %gep1, align 16 + %load2 = load <8 x half>, ptr addrspace(3) %gep2, align 16 + %load3 = load <8 x half>, ptr addrspace(3) %gep3, align 16 + %load4 = load <8 x half>, ptr addrspace(3) %gep4, align 16 + %load5 = load <8 x half>, ptr addrspace(3) %gep5, align 16 + %load6 = load <8 x half>, ptr addrspace(3) %gep6, align 16 + %load7 = load <8 x half>, ptr addrspace(3) %gep7, align 16 + %load8 = load <8 x half>, ptr addrspace(3) %gep8, align 16 + %load9 = load <8 x half>, ptr addrspace(3) %gep9, align 16 + %load10 = load <8 x half>, ptr addrspace(3) %gep10, align 16 + %load11 = load <8 x half>, ptr addrspace(3) %gep11, align 16 + %load12 = load <8 x half>, ptr addrspace(3) %gep12, align 16 + %load13 = load <8 x half>, ptr addrspace(3) %gep13, align 16 + %load14 = load <8 x half>, ptr addrspace(3) %gep14, align 16 + %load15 = load <8 x half>, ptr addrspace(3) %gep15, align 16 + br label %bb.1 + +bb.1: + %mfma0 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load0, <8 x half> %load0, <16 x float> zeroinitializer, i32 0, i32 0, i32 0) + %mfma1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load1, <8 x half> %load1, <16 x float> %mfma0, i32 0, i32 0, i32 0) + %mfma2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load2, <8 x half> %load2, <16 x float> %mfma1, i32 0, i32 0, i32 
0) + %mfma3 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load3, <8 x half> %load3, <16 x float> %mfma2, i32 0, i32 0, i32 0) + %mfma4 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load4, <8 x half> %load4, <16 x float> %mfma3, i32 0, i32 0, i32 0) + %mfma5 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load5, <8 x half> %load5, <16 x float> %mfma4, i32 0, i32 0, i32 0) + %mfma6 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load6, <8 x half> %load6, <16 x float> %mfma5, i32 0, i32 0, i32 0) + %mfma7 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load7, <8 x half> %load7, <16 x float> %mfma6, i32 0, i32 0, i32 0) + %mfma8 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load8, <8 x half> %load8, <16 x float> %mfma7, i32 0, i32 0, i32 0) + %mfma9 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load9, <8 x half> %load9, <16 x float> %mfma8, i32 0, i32 0, i32 0) + %mfma10 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load10, <8 x half> %load10, <16 x float> %mfma9, i32 0, i32 0, i32 0) + %mfma11 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load11, <8 x half> %load11, <16 x float> %mfma10, i32 0, i32 0, i32 0) + %mfma12 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load12, <8 x half> %load12, <16 x float> %mfma11, i32 0, i32 0, i32 0) + %mfma13 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load13, <8 x half> %load13, <16 x float> %mfma12, i32 0, i32 0, i32 0) + %mfma14 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load14, <8 x half> %load14, <16 x float> %mfma13, i32 0, i32 0, i32 0) + %mfma15 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load15, <8 x half> %load15, <16 x float> %mfma14, i32 0, i32 0, i32 0) + br i1 %cond, label %bb.1, label %bb.2 + +bb.2: + %out1 = getelementptr ptr, ptr %out, i32 4 + %out2 = getelementptr ptr, ptr %out, i32 8 + %out3 = getelementptr ptr, ptr %out, i32 12 + %out4 = getelementptr ptr, ptr %out, i32 16 + %out5 = getelementptr ptr, ptr %out, i32 20 + %out6 = getelementptr ptr, ptr %out, i32 24 + %out7 = getelementptr ptr, ptr %out, i32 28 + %out8 = getelementptr ptr, ptr %out, i32 32 + %out9 = getelementptr ptr, ptr %out, i32 36 + %out10 = getelementptr ptr, ptr %out, i32 40 + %out11 = getelementptr ptr, ptr %out, i32 44 + %out12 = getelementptr ptr, ptr %out, i32 48 + %out13 = getelementptr ptr, ptr %out, i32 52 + %out14 = getelementptr ptr, ptr %out, i32 56 + %out15 = getelementptr ptr, ptr %out, i32 60 + store <16 x float> %mfma0, ptr addrspace(0) %out + store <16 x float> %mfma1, ptr addrspace(0) %out1 + store <16 x float> %mfma2, ptr addrspace(0) %out2 + store <16 x float> %mfma3, ptr addrspace(0) %out3 + store <16 x float> %mfma4, ptr addrspace(0) %out4 + store <16 x float> %mfma5, ptr addrspace(0) %out5 + store <16 x float> %mfma6, ptr addrspace(0) %out6 + store <16 x float> %mfma7, ptr addrspace(0) %out7 + store <16 x float> %mfma8, ptr addrspace(0) %out8 + store <16 x float> %mfma9, ptr addrspace(0) %out9 + store <16 x float> %mfma10, ptr addrspace(0) %out10 + store <16 x float> %mfma11, ptr addrspace(0) %out11 + store <16 x float> %mfma12, ptr addrspace(0) %out12 + store <16 x float> %mfma13, ptr addrspace(0) %out13 + store <16 x float> %mfma14, ptr addrspace(0) %out14 + store <16 x float> %mfma15, ptr addrspace(0) %out15 + ret 
void +} + +define amdgpu_kernel void @good_rp(ptr addrspace(3) %in0, ptr addrspace(0) %out, i1 %cond) #0 { +; CHECK-LABEL: good_rp: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x10 +; CHECK-NEXT: s_load_dword s1, s[4:5], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_bitcmp1_b32 s0, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, s1 +; CHECK-NEXT: ds_read_b128 v[176:179], v0 +; CHECK-NEXT: ds_read_b128 v[180:183], v0 offset:16 +; CHECK-NEXT: ds_read_b128 v[184:187], v0 offset:32 +; CHECK-NEXT: ds_read_b128 v[188:191], v0 offset:48 +; CHECK-NEXT: ds_read_b128 v[192:195], v0 offset:64 +; CHECK-NEXT: ds_read_b128 v[196:199], v0 offset:80 +; CHECK-NEXT: ds_read_b128 v[200:203], v0 offset:96 +; CHECK-NEXT: ds_read_b128 v[204:207], v0 offset:112 +; CHECK-NEXT: ds_read_b128 v[208:211], v0 offset:128 +; CHECK-NEXT: ds_read_b128 v[212:215], v0 offset:144 +; CHECK-NEXT: ds_read_b128 v[216:219], v0 offset:160 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], -1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; CHECK-NEXT: .LBB1_1: ; %bb.1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_waitcnt lgkmcnt(10) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[160:175], v[176:179], v[176:179], 0 +; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_waitcnt lgkmcnt(9) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[144:159], v[180:183], v[180:183], v[160:175] +; CHECK-NEXT: s_waitcnt lgkmcnt(8) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[128:143], v[184:187], v[184:187], v[144:159] +; CHECK-NEXT: s_waitcnt lgkmcnt(7) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[112:127], v[188:191], v[188:191], v[128:143] +; CHECK-NEXT: s_waitcnt lgkmcnt(6) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[96:111], v[192:195], v[192:195], v[112:127] +; CHECK-NEXT: s_waitcnt lgkmcnt(5) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[80:95], v[196:199], v[196:199], v[96:111] +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[64:79], v[200:203], v[200:203], v[80:95] +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[48:63], v[204:207], v[204:207], v[64:79] +; CHECK-NEXT: s_waitcnt lgkmcnt(2) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[32:47], v[208:211], v[208:211], v[48:63] +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[212:215], v[212:215], v[32:47] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[216:219], v[216:219], v[16:31] +; CHECK-NEXT: s_cbranch_vccnz .LBB1_1 +; CHECK-NEXT: ; %bb.2: ; %bb.2 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b64_e32 v[88:89], s[0:1] +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[164:167] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[160:163] +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[144:147] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[148:151] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[128:131] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[132:135] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[112:115] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[116:119] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[100:103] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[96:99] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[80:83] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[84:87] offset:176 +; CHECK-NEXT: flat_store_dwordx4 
v[88:89], v[68:71] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[76:79] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[72:75] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[64:67] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[60:63] offset:272 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[56:59] offset:256 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[52:55] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[48:51] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[44:47] offset:304 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[40:43] offset:288 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[36:39] offset:272 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[32:35] offset:256 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[28:31] offset:336 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[24:27] offset:320 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[20:23] offset:304 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[16:19] offset:288 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[12:15] offset:368 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[8:11] offset:352 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[4:7] offset:336 +; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[0:3] offset:320 +; CHECK-NEXT: s_endpgm +; INFLATE-LABEL: good_rp: +; INFLATE: ; %bb.0: +; INFLATE-NEXT: s_load_dword s0, s[4:5], 0x10 +; INFLATE-NEXT: s_load_dword s1, s[4:5], 0x0 +; INFLATE-NEXT: s_waitcnt lgkmcnt(0) +; INFLATE-NEXT: s_bitcmp1_b32 s0, 0 +; INFLATE-NEXT: v_mov_b32_e32 v0, s1 +; INFLATE-NEXT: ds_read_b128 v[176:179], v0 +; INFLATE-NEXT: ds_read_b128 v[180:183], v0 offset:16 +; INFLATE-NEXT: ds_read_b128 v[184:187], v0 offset:32 +; INFLATE-NEXT: ds_read_b128 v[188:191], v0 offset:48 +; INFLATE-NEXT: ds_read_b128 v[192:195], v0 offset:64 +; INFLATE-NEXT: ds_read_b128 v[196:199], v0 offset:80 +; INFLATE-NEXT: ds_read_b128 v[200:203], v0 offset:96 +; INFLATE-NEXT: ds_read_b128 v[204:207], v0 offset:112 +; INFLATE-NEXT: ds_read_b128 v[208:211], v0 offset:128 +; INFLATE-NEXT: ds_read_b128 v[212:215], v0 offset:144 +; INFLATE-NEXT: ds_read_b128 v[216:219], v0 offset:160 +; INFLATE-NEXT: s_cselect_b64 s[0:1], -1, 0 +; INFLATE-NEXT: s_xor_b64 s[0:1], s[0:1], -1 +; INFLATE-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; INFLATE-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; INFLATE-NEXT: .LBB1_1: ; %bb.1 +; INFLATE-NEXT: ; =>This Inner Loop Header: Depth=1 +; INFLATE-NEXT: s_waitcnt lgkmcnt(10) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[160:175], v[176:179], v[176:179], 0 +; INFLATE-NEXT: s_and_b64 vcc, exec, s[0:1] +; INFLATE-NEXT: s_waitcnt lgkmcnt(9) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[144:159], v[180:183], v[180:183], v[160:175] +; INFLATE-NEXT: s_waitcnt lgkmcnt(8) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[128:143], v[184:187], v[184:187], v[144:159] +; INFLATE-NEXT: s_waitcnt lgkmcnt(7) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[112:127], v[188:191], v[188:191], v[128:143] +; INFLATE-NEXT: s_waitcnt lgkmcnt(6) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[96:111], v[192:195], v[192:195], v[112:127] +; INFLATE-NEXT: s_waitcnt lgkmcnt(5) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[80:95], v[196:199], v[196:199], v[96:111] +; INFLATE-NEXT: s_waitcnt lgkmcnt(4) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[64:79], v[200:203], v[200:203], v[80:95] +; INFLATE-NEXT: s_waitcnt lgkmcnt(3) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[48:63], v[204:207], v[204:207], v[64:79] +; INFLATE-NEXT: s_waitcnt lgkmcnt(2) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[32:47], v[208:211], v[208:211], v[48:63] +; 
INFLATE-NEXT: s_waitcnt lgkmcnt(1) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[212:215], v[212:215], v[32:47] +; INFLATE-NEXT: s_waitcnt lgkmcnt(0) +; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[216:219], v[216:219], v[16:31] +; INFLATE-NEXT: s_cbranch_vccnz .LBB1_1 +; INFLATE-NEXT: ; %bb.2: ; %bb.2 +; INFLATE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; INFLATE-NEXT: s_waitcnt lgkmcnt(0) +; INFLATE-NEXT: v_mov_b64_e32 v[88:89], s[0:1] +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[164:167] offset:16 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[160:163] +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[144:147] offset:32 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[148:151] offset:48 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[128:131] offset:64 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[132:135] offset:80 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[112:115] offset:96 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[116:119] offset:112 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[100:103] offset:144 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[96:99] offset:128 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[80:83] offset:160 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[84:87] offset:176 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[68:71] offset:208 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[76:79] offset:240 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[72:75] offset:224 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[64:67] offset:192 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[60:63] offset:272 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[56:59] offset:256 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[52:55] offset:240 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[48:51] offset:224 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[44:47] offset:304 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[40:43] offset:288 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[36:39] offset:272 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[32:35] offset:256 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[28:31] offset:336 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[24:27] offset:320 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[20:23] offset:304 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[16:19] offset:288 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[12:15] offset:368 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[8:11] offset:352 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[4:7] offset:336 +; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[0:3] offset:320 +; INFLATE-NEXT: s_endpgm +; +; GCN-LABEL: good_rp: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s0, s[4:5], 0x10 +; GCN-NEXT: s_load_dword s1, s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s0, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: ds_read_b128 v[176:179], v0 +; GCN-NEXT: ds_read_b128 v[180:183], v0 offset:16 +; GCN-NEXT: ds_read_b128 v[184:187], v0 offset:32 +; GCN-NEXT: ds_read_b128 v[188:191], v0 offset:48 +; GCN-NEXT: ds_read_b128 v[192:195], v0 offset:64 +; GCN-NEXT: ds_read_b128 v[196:199], v0 offset:80 +; GCN-NEXT: ds_read_b128 v[200:203], v0 offset:96 +; GCN-NEXT: ds_read_b128 v[204:207], v0 offset:112 +; GCN-NEXT: ds_read_b128 v[208:211], v0 offset:128 +; GCN-NEXT: ds_read_b128 v[212:215], v0 offset:144 +; GCN-NEXT: ds_read_b128 v[216:219], v0 offset:160 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; 
GCN-NEXT: .LBB1_1: ; %bb.1 +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_waitcnt lgkmcnt(10) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[160:175], v[176:179], v[176:179], 0 +; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_waitcnt lgkmcnt(9) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[144:159], v[180:183], v[180:183], v[160:175] +; GCN-NEXT: s_waitcnt lgkmcnt(8) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[128:143], v[184:187], v[184:187], v[144:159] +; GCN-NEXT: s_waitcnt lgkmcnt(7) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[112:127], v[188:191], v[188:191], v[128:143] +; GCN-NEXT: s_waitcnt lgkmcnt(6) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[96:111], v[192:195], v[192:195], v[112:127] +; GCN-NEXT: s_waitcnt lgkmcnt(5) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[80:95], v[196:199], v[196:199], v[96:111] +; GCN-NEXT: s_waitcnt lgkmcnt(4) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[64:79], v[200:203], v[200:203], v[80:95] +; GCN-NEXT: s_waitcnt lgkmcnt(3) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[48:63], v[204:207], v[204:207], v[64:79] +; GCN-NEXT: s_waitcnt lgkmcnt(2) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[32:47], v[208:211], v[208:211], v[48:63] +; GCN-NEXT: s_waitcnt lgkmcnt(1) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[212:215], v[212:215], v[32:47] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[216:219], v[216:219], v[16:31] +; GCN-NEXT: s_cbranch_vccnz .LBB1_1 +; GCN-NEXT: ; %bb.2: ; %bb.2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b64_e32 v[88:89], s[0:1] +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[164:167] offset:16 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[160:163] +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[144:147] offset:32 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[148:151] offset:48 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[128:131] offset:64 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[132:135] offset:80 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[112:115] offset:96 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[116:119] offset:112 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[100:103] offset:144 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[96:99] offset:128 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[80:83] offset:160 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[84:87] offset:176 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[68:71] offset:208 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[76:79] offset:240 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[72:75] offset:224 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[64:67] offset:192 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[60:63] offset:272 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[56:59] offset:256 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[52:55] offset:240 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[48:51] offset:224 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[44:47] offset:304 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[40:43] offset:288 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[36:39] offset:272 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[32:35] offset:256 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[28:31] offset:336 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[24:27] offset:320 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[20:23] offset:304 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[16:19] offset:288 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[12:15] offset:368 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[8:11] offset:352 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[4:7] offset:336 +; GCN-NEXT: flat_store_dwordx4 v[88:89], v[0:3] offset:320 
+; GCN-NEXT: s_endpgm + %gep1 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 4 + %gep2 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 8 + %gep3 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 12 + %gep4 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 16 + %gep5 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 20 + %gep6 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 24 + %gep7 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 28 + %gep8 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 32 + %gep9 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 36 + %gep10 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 40 + %load0 = load <8 x half>, ptr addrspace(3) %in0, align 16 + %load1 = load <8 x half>, ptr addrspace(3) %gep1, align 16 + %load2 = load <8 x half>, ptr addrspace(3) %gep2, align 16 + %load3 = load <8 x half>, ptr addrspace(3) %gep3, align 16 + %load4 = load <8 x half>, ptr addrspace(3) %gep4, align 16 + %load5 = load <8 x half>, ptr addrspace(3) %gep5, align 16 + %load6 = load <8 x half>, ptr addrspace(3) %gep6, align 16 + %load7 = load <8 x half>, ptr addrspace(3) %gep7, align 16 + %load8 = load <8 x half>, ptr addrspace(3) %gep8, align 16 + %load9 = load <8 x half>, ptr addrspace(3) %gep9, align 16 + %load10 = load <8 x half>, ptr addrspace(3) %gep10, align 16 + br label %bb.1 + +bb.1: + %mfma0 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load0, <8 x half> %load0, <16 x float> zeroinitializer, i32 0, i32 0, i32 0) + %mfma1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load1, <8 x half> %load1, <16 x float> %mfma0, i32 0, i32 0, i32 0) + %mfma2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load2, <8 x half> %load2, <16 x float> %mfma1, i32 0, i32 0, i32 0) + %mfma3 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load3, <8 x half> %load3, <16 x float> %mfma2, i32 0, i32 0, i32 0) + %mfma4 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load4, <8 x half> %load4, <16 x float> %mfma3, i32 0, i32 0, i32 0) + %mfma5 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load5, <8 x half> %load5, <16 x float> %mfma4, i32 0, i32 0, i32 0) + %mfma6 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load6, <8 x half> %load6, <16 x float> %mfma5, i32 0, i32 0, i32 0) + %mfma7 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load7, <8 x half> %load7, <16 x float> %mfma6, i32 0, i32 0, i32 0) + %mfma8 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load8, <8 x half> %load8, <16 x float> %mfma7, i32 0, i32 0, i32 0) + %mfma9 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load9, <8 x half> %load9, <16 x float> %mfma8, i32 0, i32 0, i32 0) + %mfma10 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load10, <8 x half> %load10, <16 x float> %mfma9, i32 0, i32 0, i32 0) + br i1 %cond, label %bb.1, label %bb.2 + +bb.2: + %out1 = getelementptr ptr, ptr %out, i32 4 + %out2 = getelementptr ptr, ptr %out, i32 8 + %out3 = getelementptr ptr, ptr %out, i32 12 + %out4 = getelementptr ptr, ptr %out, i32 16 + %out5 = getelementptr ptr, ptr %out, i32 20 + %out6 = getelementptr ptr, ptr %out, i32 24 + %out7 = getelementptr ptr, ptr %out, i32 28 + %out8 = getelementptr ptr, ptr %out, i32 32 + %out9 = getelementptr ptr, ptr %out, i32 36 + 
%out10 = getelementptr ptr, ptr %out, i32 40 + store <16 x float> %mfma0, ptr addrspace(0) %out + store <16 x float> %mfma1, ptr addrspace(0) %out1 + store <16 x float> %mfma2, ptr addrspace(0) %out2 + store <16 x float> %mfma3, ptr addrspace(0) %out3 + store <16 x float> %mfma4, ptr addrspace(0) %out4 + store <16 x float> %mfma5, ptr addrspace(0) %out5 + store <16 x float> %mfma6, ptr addrspace(0) %out6 + store <16 x float> %mfma7, ptr addrspace(0) %out7 + store <16 x float> %mfma8, ptr addrspace(0) %out8 + store <16 x float> %mfma9, ptr addrspace(0) %out9 + store <16 x float> %mfma10, ptr addrspace(0) %out10 + ret void +} + +attributes #0 = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1" }